File vapoursynth-tivtc-2+2.g7abd4a3.obscpio of Package vapoursynth-plugin-tivtc

07070100000000000081A4000000000000000000000001671240C900000707000000000000000000000000000000000000002B00000000vapoursynth-tivtc-2+2.g7abd4a3/meson.buildproject('TIVTC', 'cpp',
        version: '1',
        default_options: ['cpp_std=c++17', 'buildtype=release'],
        meson_version: '>=0.46')


# Warning flags passed to the C++ compiler for every source file.
warnings = [
  '-Wall',
  '-Wextra',
  '-Wshadow',
  '-Wno-unused-function',
]

# Compile flags: the warnings above plus hidden default symbol visibility.
cflags = [
  warnings,
  '-fvisibility=hidden',
]

# Linker flags; empty by default, extended below for 32-bit Windows builds.
ldflags = [
]


host_cpu_family = host_machine.cpu_family()

# Only x86 / x86_64 hosts are accepted; SSE2 code generation is forced on and
# VS_TARGET_CPU_X86 is defined for the sources.
if host_cpu_family.startswith('x86')
  cflags += ['-mfpmath=sse', '-msse2', '-DVS_TARGET_CPU_X86=1']
else
  error('TIVTC can be built only for x86 systems at this time.')
endif


host_system = host_machine.system()

if host_system == 'windows' or host_system == 'cygwin'
  # 32-bit Windows only: realign the stack on function entry and strip the
  # @nn stdcall suffix from exported symbol names.
  if host_cpu_family == 'x86'
    cflags += '-mstackrealign'
    ldflags += '-Wl,--kill-at'
  endif

  # Avoid linking errors:
  # /usr/lib/gcc/x86_64-w64-mingw32/10.2.0/../../../../x86_64-w64-mingw32/bin/ld: libtivtc.dll.p/src_TFM.cpp.obj:TFM.cpp:(.text+0x9019): undefined reference to `__strcat_chk'
  cflags += ['-U_FORTIFY_SOURCE', '-D_FORTIFY_SOURCE=0']
endif


cxx = meson.get_compiler('cpp')

cxx_id = cxx.get_id()

# Tell the sources which compiler family is in use (CLANG / GCC macros).
if cxx_id.startswith('clang')
  cflags += '-DCLANG=1'
endif

if cxx_id == 'gcc'
  cflags += '-DGCC=1'
endif


# Every translation unit that goes into the plugin.
sources = [
  'src/calcCRC.cpp',
  'src/cpufeatures.cpp',
  'src/Cycle.cpp',
  'src/PluginInit.cpp',
  'src/TCommonASM.cpp',
  'src/TDecimate.cpp',
  'src/TDecimateASM.cpp',
  'src/TDecimateBlur.cpp',
  'src/TDecimateMode2.cpp',
  'src/TDecimateMode7.cpp',
  'src/TDecimateOut.cpp',
  'src/TFM.cpp',
  'src/TFMASM.cpp',
  'src/TFMD2V.cpp',
  'src/TFMPlanar.cpp',
  'src/TFMPP.cpp',
]

# Only the include paths and compile arguments of VapourSynth are used;
# the plugin is not linked against the VapourSynth library.
deps = [
  dependency('vapoursynth').partial_dependency(includes: true, compile_args: true),
]

# Build the plugin as a loadable module named "tivtc" and install it.
shared_module('tivtc',
              sources,
              dependencies: deps,
              link_args: ldflags,
              cpp_args: cflags,
              install: true)
07070100000001000041ED000000000000000000000002671240C900000000000000000000000000000000000000000000002300000000vapoursynth-tivtc-2+2.g7abd4a3/src07070100000002000081A4000000000000000000000001671240C900003E4F000000000000000000000000000000000000002D00000000vapoursynth-tivtc-2+2.g7abd4a3/src/Cycle.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "Cycle.h"
//#include "avisynth.h"
#include "stdint.h"
#include <inttypes.h>
//#include <windows.h> // OutputDebugString
#include <algorithm>
#include <cstring>
#include "internal.h"

// Positions the cycle so that it starts at frameIn and recomputes every
// derived boundary (frameE, cycleS, cycleE, offE, frameSO, frameEO).
// All per-cycle state is wiped first via clearAll(). Calling it again with
// the frame the cycle is already positioned on is a no-op.
void Cycle::setFrame(int frameIn)
{
  if (frameIn == frame) return;

  clearAll();
  frame = frameIn;
  frameE = frameIn + length;

  // Start offset: a cycle beginning at a negative frame skips the slots in
  // front of frame 0, but never more than the whole cycle.
  const int startOffset = (frameIn > 0) ? 0 : -frameIn;
  cycleS = std::min(startOffset, length);

  // End offset: a cycle running past maxFrame is cut short, but never below 0.
  int endOffset = length;
  if (frameE > maxFrame) endOffset = length - (frameE - maxFrame - 1);
  cycleE = std::max(endOffset, 0);

  offE = length - cycleE;
  frameSO = frameIn + cycleS;
  frameEO = frameIn + cycleE;
}

// Marks frames of the current cycle for decimation, taking candidates in the
// order stored in `lowest`.
//
// num is the total number of frames that should end up marked (== 1) in
// `decimate`; entries already equal to 1 inside [cycleS, cycleE) are kept and
// counted towards that total. A candidate is skipped while an already marked
// frame lies within `asd` positions of it (asd starts at |sdlim|).
//
// Does nothing when decSet is already true. When lowSet or mSet is false the
// decimate arrays are filled with -20 and decSet stays false.
// Throws TIVTCError when the requested number of frames cannot be marked.
void Cycle::setDecimateLow(int num)
{
  if (decSet) return;
  if (!lowSet || !mSet)
  {
    for (int x = 0; x < length; ++x) decimate[x] = decimate2[x] = -20;
    return;
  }
  // Slots outside [cycleS, cycleE) are "not set"; inside, keep existing
  // marks (counted in ovrDec) and clear everything else to 0.
  for (int i = 0; i < cycleS; ++i) decimate[i] = decimate2[i] = -20;
  int ovrDec = 0;
  for (int i = cycleS; i < cycleE; ++i)
  {
    if (decimate[i] != 1) decimate[i] = decimate2[i] = 0;
    else ++ovrDec;
  }
  for (int i = std::max(cycleE, 0); i < length; ++i) decimate[i] = decimate2[i] = -20;
  const int istop = cycleE - cycleS;
  int asd = abs(sdlim);
  // A negative sdlim means the search may be retried with a smaller
  // distance, so snapshot the starting state to restore before each retry.
  if (sdlim < 0)
  {
    memcpy(dect, decimate, cycleSize * sizeof(int));
    memcpy(dect2, decimate2, cycleSize * sizeof(int));
  }
  int v = 0; // number of frames newly marked in this call
mrestart:
  for (int i = 0; v < num - ovrDec && i < istop; ++i)
  {
    // Reject the candidate if any frame within asd positions is already marked.
    bool update = true;
    for (int c = std::max(cycleS, lowest[i] - asd); c <= std::min(cycleE - 1, lowest[i] + asd); ++c)
    {
      if (decimate[c] == 1)
      {
        update = false;
        break;
      }
    }
    if (update)
    {
      decimate[lowest[i]] = 1;
      // decimate2 receives the mark at the first slot at or after the
      // candidate that is not already marked.
      // NOTE(review): this scan has no upper bound on u — confirm it cannot
      // run past the end of decimate2.
      int u = lowest[i];
      while (decimate2[u] == 1) ++u;
      decimate2[u] = 1;
      ++v;
    }
  }
  if (v != num - ovrDec)
  {
    // Not enough frames marked: count the candidates still unmarked.
    int remain = 0;
    for (int i = 0; i < istop; ++i)
    {
      if (decimate[lowest[i]] != 1)
        ++remain;
    }
    if (remain > 0 && asd > 0)
    {
      if (sdlim < 0)
      {
        // Negative sdlim: shrink the distance by one and start over from
        // the snapshot.
        --asd;
        memcpy(decimate, dect, cycleSize * sizeof(int));
        memcpy(decimate2, dect2, cycleSize * sizeof(int));
        v = 0;
      }
      // Non-negative sdlim: drop the distance limit entirely but keep the
      // marks made so far.
      else asd = 0;
      goto mrestart;
    }
    throw TIVTCError("TIVTC-Cycle:  unable to mark the required number of frames " \
      "for decimation (1).");
  }
  decSet = true;
}

// Marks num additional frames for decimation on top of an already filled
// decimate array, taking candidates in the order stored in `lowest`.
//
// Unlike setDecimateLow() the existing contents of decimate/decimate2 are
// not reset and decSet is not modified. A candidate is skipped while an
// already marked frame lies within `asd` positions of it (asd starts at
// |sdlim|).
//
// When lowSet, mSet or decSet is false the decimate arrays are filled with
// -20 and the function returns.
// Throws TIVTCError when num frames cannot be marked.
void Cycle::setDecimateLowP(int num)
{
  if (!lowSet || !mSet || !decSet)
  {
    for (int x = 0; x < length; ++x) decimate[x] = decimate2[x] = -20;
    return;
  }
  const int istop = cycleE - cycleS;
  int asd = abs(sdlim);
  // A negative sdlim means the search may be retried with a smaller
  // distance, so snapshot the starting state to restore before each retry.
  if (sdlim < 0)
  {
    memcpy(dect, decimate, cycleSize * sizeof(int));
    memcpy(dect2, decimate2, cycleSize * sizeof(int));
  }
  int v = 0; // number of frames newly marked in this call
mrestart:
  for (int i = 0; v < num && i < istop; ++i)
  {
    // Reject the candidate if any frame within asd positions is already marked.
    bool update = true;
    for (int c = std::max(cycleS, lowest[i] - asd); c <= std::min(cycleE - 1, lowest[i] + asd); ++c)
    {
      if (decimate[c] == 1)
      {
        update = false;
        break;
      }
    }
    if (update)
    {
      decimate[lowest[i]] = 1;
      // decimate2 receives the mark at the first slot at or after the
      // candidate that is not already marked.
      // NOTE(review): this scan has no upper bound on u — confirm it cannot
      // run past the end of decimate2.
      int u = lowest[i];
      while (decimate2[u] == 1) ++u;
      decimate2[u] = 1;
      ++v;
    }
  }
  if (v != num)
  {
    // Not enough frames marked: count the candidates still unmarked.
    int remain = 0;
    for (int i = 0; i < istop; ++i)
    {
      if (decimate[lowest[i]] != 1)
        ++remain;
    }
    if (remain > 0 && asd > 0)
    {
      if (sdlim < 0)
      {
        // Negative sdlim: shrink the distance by one and start over from
        // the snapshot.
        --asd;
        memcpy(decimate, dect, cycleSize * sizeof(int));
        memcpy(decimate2, dect2, cycleSize * sizeof(int));
        v = 0;
      }
      // Non-negative sdlim: drop the distance limit entirely but keep the
      // marks made so far.
      else asd = 0;
      goto mrestart;
    }
    throw TIVTCError("TIVTC-Cycle:  unable to mark the required number of frames " \
      "for decimation (2).");
  }
}

// Fills `lowest` with the cycle positions ordered by ascending unnormalized
// metric (diffMetricsU), using tArray as scratch space for the sort keys.
//
// Positions that must not be picked get the key UINT64_MAX so they sort to
// the end: everything before the first usable slot, everything from cycleE
// on, and (when decSet is false or excludeD is true) positions already
// marked in `decimate`.
//
// Does nothing if lowSet is already true. If mSet is false, `lowest` is
// filled with -20 and lowSet stays false.
void Cycle::setLowest(bool excludeD)
{
  if (lowSet) return;
  if (!mSet)
  {
    for (int k = 0; k < length; ++k) lowest[k] = -20;
    return;
  }

  // When the cycle starts at frame 0 one extra leading slot is excluded.
  const int firstUsable = (frame == 0) ? cycleS + 1 : cycleS;

  for (int k = 0; k < length; ++k)
  {
    lowest[k] = k;
    tArray[k] = diffMetricsU[k];
  }
  for (int k = 0; k < firstUsable; ++k) tArray[k] = UINT64_MAX;
  for (int k = std::max(cycleE, 0); k < length; ++k) tArray[k] = UINT64_MAX;

  if (!decSet || excludeD)
  {
    for (int k = cycleS; k < cycleE; ++k)
    {
      if (decimate[k] == 1) tArray[k] = UINT64_MAX;
    }
  }

  // Insertion sort on the keys, carrying the position index along. Equal
  // keys keep their original relative order.
  for (int pos = 1; pos < length; ++pos)
  {
    const uint64_t keyMetric = tArray[pos];
    const int keyIndex = lowest[pos];
    int slot = pos;
    for (; slot > 0 && tArray[slot - 1] > keyMetric; --slot)
    {
      tArray[slot] = tArray[slot - 1];
      lowest[slot] = lowest[slot - 1];
    }
    tArray[slot] = keyMetric;
    lowest[slot] = keyIndex;
  }
  lowSet = true;
}

// Flags duplicates from the normalized metrics: a position inside
// [cycleS, cycleE) is a duplicate (dupArray == 1) when its diffMetricsN value
// is <= thresh, otherwise 0. Positions outside that range are set to -20 and
// dupCount receives the number of duplicates.
//
// When the cycle starts at frame 0 the first in-range position is never
// reported as a duplicate.
//
// Does nothing if dupsSet is already true. If mSet is false, dupArray and
// dupCount are set to -20 and dupsSet stays false.
void Cycle::setDups(double thresh)
{
  if (dupsSet) return;
  if (!mSet)
  {
    for (int k = 0; k < length; ++k) dupArray[k] = -20;
    dupCount = -20;
    return;
  }

  dupCount = 0;
  for (int k = 0; k < cycleS; ++k) dupArray[k] = -20;
  for (int k = cycleS; k < cycleE; ++k)
  {
    const bool isDup = diffMetricsN[k] <= thresh;
    dupArray[k] = isDup ? 1 : 0;
    if (isDup) ++dupCount;
  }
  for (int k = std::max(cycleE, 0); k < length; ++k) dupArray[k] = -20;

  if (frame == 0 && dupArray[cycleS] == 1)
  {
    dupArray[cycleS] = 0;
    --dupCount;
  }
  dupsSet = true;
}

// Flags duplicates from the field-match sequence instead of the metrics:
// each in-range position is a duplicate when checkMatchDup() accepts the
// (previous match, current match) pair.
//
// The match preceding the first position is taken from the last in-range
// entry of cycle p when p is positioned on a different frame; failing that,
// it is decoded from marray when that array is non-empty and holds a usable
// entry for the frame right before this cycle.
//
// Does nothing if dupsSet is already true. If any in-range match value lies
// outside 0..6, dupArray and dupCount are set to -20 and dupsSet stays false.
void Cycle::setDupsMatches(Cycle &p, const std::vector<uint8_t> &marray)
{
  if (dupsSet) return;

  bool invalidMatch = false;
  for (int k = cycleS; k < cycleE && !invalidMatch; ++k)
    invalidMatch = (match[k] < 0 || match[k] > 6);
  if (invalidMatch)
  {
    for (int k = 0; k < length; ++k) dupArray[k] = -20;
    dupCount = -20;
    return;
  }

  dupCount = 0;
  for (int k = 0; k < cycleS; ++k) dupArray[k] = -20;

  int prevMatch = -20;
  if (p.cycleE > 0 && p.frame != frame) prevMatch = p.match[p.cycleE - 1];

  if (prevMatch == -20 && !marray.empty())
  {
    const int n = (p.frame == frame) ? frameSO - 1 : p.cycleE - 1;
    // Only the frame immediately before this cycle, within the clip and
    // outside the cycle's own range, is acceptable.
    const bool usable = n >= 0 && n <= maxFrame &&
      !(n >= frameSO && n < frameEO) && n == frameSO - 1;
    if (usable)
    {
      int value = marray[n];
      if ((value & ISMATCH) != 0x70)
      {
        value = (value & ISMATCH) >> 4;
        // Accept the decoded value only when it is one of the known codes.
        if (value == ISC || value == ISP || value == ISN || value == ISB ||
          value == ISU || value == ISDB || value == ISDT)
          prevMatch = value;
      }
    }
  }

  int currMatch = match[cycleS];
  for (int k = cycleS; k < cycleE; ++k)
  {
    const bool isDup = checkMatchDup(prevMatch, currMatch);
    dupArray[k] = isDup ? 1 : 0;
    if (isDup) ++dupCount;
    prevMatch = currMatch;
    if (k + 1 < cycleE) currMatch = match[k + 1];
  }
  for (int k = std::max(cycleE, 0); k < length; ++k) dupArray[k] = -20;
  dupsSet = true;
}

// Sets isfilmd2v to true when at least one position inside [cycleS, cycleE)
// has filmd2v == 1, and to false otherwise.
void Cycle::setIsFilmD2V()
{
  bool flagged = false;
  for (int k = cycleS; k < cycleE && !flagged; ++k)
    flagged = (filmd2v[k] == 1);
  isfilmd2v = flagged;
}

// Returns true when the pair (previous match mp, current match mc) counts as
// a duplicate. The accepted pairs are listed per mp value below; in addition
// any negative mc is always reported as a duplicate.
bool Cycle::checkMatchDup(int mp, int mc)
{
  bool pairIsDup = false;
  switch (mp)
  {
  case 0: pairIsDup = (mc == 3); break;
  case 1: pairIsDup = (mc == 0 || mc == 3); break;
  case 2: pairIsDup = (mc == 1 || mc == 3 || mc == 4 || mc == 6); break;
  case 3: pairIsDup = (mc == 0); break;
  case 4: pairIsDup = (mc == 0 || mc == 1 || mc == 2 || mc == 5); break;
  case 5: pairIsDup = (mc == 3); break;
  case 6: pairIsDup = (mc == 0); break;
  default: break;
  }
  return pairIsDup || mc < 0;
}

// Returns the cycle position of the n-th (0-based) frame whose decimate
// entry is 0, scanning [cycleS, cycleE). Returns -1 when decSet is false or
// no such position exists.
int Cycle::getNonDec(int n)
{
  if (!decSet) return -1;

  int keptSoFar = -1;
  for (int pos = cycleS; pos < cycleE; ++pos)
  {
    if (decimate[pos] == 0) ++keptSoFar;
    // Checked on every iteration, not only after an increment.
    if (keptSoFar == n) return pos;
  }
  return -1;
}

// Resets the cycle to its "nothing set" state: all flags false, positional
// scalars, dupCount and blend to -20, type to -1, and the first `length`
// entries of the data arrays to their not-set values. length, maxFrame,
// sdlim and tArray are left as they are.
void Cycle::clearAll()
{
  mSet = false;
  lowSet = false;
  dupsSet = false;
  decSet = false;
  isfilmd2v = false;

  frame = -20;
  frameE = -20;
  cycleS = -20;
  cycleE = -20;
  offE = -20;
  frameSO = -20;
  frameEO = -20;
  dupCount = -20;
  blend = -20;
  type = -1;

  for (int k = 0; k < length; ++k)
  {
    dupArray[k] = -20;
    lowest[k] = -20;
    decimate[k] = -20;
    decimate2[k] = -20;
    match[k] = -20;
    filmd2v[k] = -20;
    diffMetricsU[k] = UINT64_MAX;
    diffMetricsUF[k] = UINT64_MAX;
    diffMetricsN[k] = -20.0;
  }
}

// Looks for a single position in [cycleS, cycleE) whose frame metric
// (diffMetricsUF) exceeds thresh.
//
// Returns the position before the hit (or the hit itself when it is at
// position 0 or below) if exactly one position exceeds thresh; returns -20
// when mSet is false or when the number of hits is not exactly one.
int Cycle::sceneDetect(uint64_t thresh)
{
  if (!mSet) return -20;

  int hits = 0;
  int lastHit = -1;
  for (int pos = cycleS; pos < cycleE; ++pos)
  {
    if (diffMetricsUF[pos] > thresh)
    {
      lastHit = pos;
      ++hits;
    }
  }

  if (hits != 1) return -20;
  return (lastHit > 0) ? lastHit - 1 : lastHit;
}

// Variant of sceneDetect(thresh) that also looks at the neighbouring cycles.
//
// Returns -20 when this cycle, prev or next has no metrics set. For cycles
// longer than 10 the neighbours are ignored and the single-cycle check is
// used directly. Otherwise any in-range frame metric above thresh in prev or
// next yields -20; if both are clean, the result is that of the single-cycle
// check on this cycle.
int Cycle::sceneDetect(Cycle &prev, Cycle &next, uint64_t thresh)
{
  if (!mSet || !prev.mSet || !next.mSet) return -20;
  if (length > 10) return sceneDetect(thresh);

  // True when any in-range frame metric of c exceeds thresh.
  const auto exceeds = [thresh](const Cycle &c)
  {
    for (int pos = c.cycleS; pos < c.cycleE; ++pos)
    {
      if (c.diffMetricsUF[pos] > thresh) return true;
    }
    return false;
  };

  if (exceeds(prev)) return -20;
  if (exceeds(next)) return -20;

  // mSet is known to be true here, so this performs exactly the
  // single-hit scan over this cycle.
  return sceneDetect(thresh);
}

// Debug helper that is currently a no-op: the code that formatted the
// cycle's scalar members and flags and sent them to OutputDebugString is
// kept below, commented out.
void Cycle::debugOutput()
{
//  char temp[256];
//  sprintf(temp, "Cycle:  length = %d  maxFrame = %d  size = %d\n", length, maxFrame, cycleSize);
//  OutputDebugString(temp);
//  sprintf(temp, "Cycle:  frame = %d   frameE = %d\n", frame, frameE);
//  OutputDebugString(temp);
//  sprintf(temp, "Cycle:  cycleS = %d  cycleE = %d\n", cycleS, cycleE);
//  OutputDebugString(temp);
//  sprintf(temp, "Cycle:  frameSO = %d frameEO = %d\n", frameSO, frameEO);
//  OutputDebugString(temp);
//  sprintf(temp, "Cycle:  offE = %d    type = %d  blend = %d  dupCount = %d\n", offE, type, blend, dupCount);
//  OutputDebugString(temp);
//  sprintf(temp, "Cycle:  dupSet = %c  mSet = %c  lowSet = %c  decSet = %c  isfilmd2v = %c\n",
//    dupsSet ? 'T' : 'F', mSet ? 'T' : 'F', lowSet ? 'T' : 'F', decSet ? 'T' : 'F',
//    isfilmd2v ? 'T' : 'F');
//  OutputDebugString(temp);
}

// Debug helper that is currently a no-op: the code that printed the first
// _length entries of the metric and marker arrays through OutputDebugString
// is kept below, commented out.
void Cycle::debugMetrics(int _length)
{
  // _length is only referenced by the disabled code below; mark it as used
  // so the -Wextra build does not report an unused parameter.
  (void)_length;
//  char temp[256];
//  for (int x = 0; x < _length; ++x)
//  {
//    sprintf(temp, "Cycle:  %d - %3.2f  %" PRIu64 "  %" PRIu64 "\n", x, diffMetricsN[x], diffMetricsU[x],
//      diffMetricsUF[x]);
//    OutputDebugString(temp);
//    sprintf(temp, "Cycle:  %d - dup = %d  lowest = %d  decimate = %d  decimate2 = %d  match = %d  filmd2v = %d\n", x,
//      dupArray[x], lowest[x], decimate[x], decimate2[x], match[x], filmd2v[x]);
//    OutputDebugString(temp);
//  }
}

// Builds an empty cycle with room for _size entries (a negative _size is
// treated as 0) and the given sdlim. Flags start false, scalar members at
// -20 (type at -1), and every array slot at its not-set value.
//
// Throws TIVTCError when the arrays cannot be allocated.
Cycle::Cycle(int _size, int _sdlim)
{
  mSet = lowSet = dupsSet = decSet = isfilmd2v = false;
  length = frame = frameE = cycleS = cycleE = offE = -20;
  frameSO = frameEO = maxFrame = dupCount = blend = -20;
  type = -1;
  // allocSpace() frees whatever the pointers hold, so they must be null
  // before the first call.
  dupArray = lowest = match = decimate = decimate2 = filmd2v = nullptr;
  dect = dect2 = nullptr;
  diffMetricsU = diffMetricsUF = tArray = nullptr;
  diffMetricsN = nullptr;
  cycleSize = std::max(0, _size);
  sdlim = _sdlim;
  // A failed allocation must not reach the fill loop below, which would
  // dereference null pointers. A zero-sized request is not treated as a
  // failure because malloc(0) is allowed to return null.
  if (!allocSpace() && cycleSize > 0)
  {
    // The destructor does not run when a constructor throws, so release the
    // part of the allocation that did succeed (free(nullptr) is a no-op).
    void *const partial[] = { dupArray, lowest, match, filmd2v, decimate, decimate2,
      dect, dect2, diffMetricsU, diffMetricsUF, tArray, diffMetricsN };
    for (void *ptr : partial) free(ptr);
    throw TIVTCError("TIVTC-Cycle:  unable to allocate memory for the cycle arrays.");
  }
  for (int x = 0; x < cycleSize; ++x)
  {
    dupArray[x] = lowest[x] = decimate[x] = match[x] = decimate2[x] = filmd2v[x] = -20;
    diffMetricsU[x] = diffMetricsUF[x] = tArray[x] = UINT64_MAX;
    diffMetricsN[x] = -20.0;
  }
}

// Releases every array owned by the cycle and nulls the pointers.
Cycle::~Cycle()
{
  // free() accepts a null pointer, so no per-pointer check is needed.
  const auto release = [](auto *&array)
  {
    free(array);
    array = nullptr;
  };

  release(dupArray);
  release(lowest);
  release(match);
  release(filmd2v);
  release(decimate);
  release(decimate2);
  release(dect);
  release(dect2);
  release(diffMetricsU);
  release(diffMetricsUF);
  release(tArray);
  release(diffMetricsN);
}

// Frees any arrays currently held and allocates fresh ones with room for
// cycleSize entries each. The new arrays are left uninitialized.
//
// Returns true when every allocation produced a non-null pointer and false
// otherwise; on false, the arrays that were allocated remain owned by the
// object.
bool Cycle::allocSpace()
{
  // free() accepts a null pointer, so no per-pointer check is needed.
  const auto release = [](auto *&array)
  {
    free(array);
    array = nullptr;
  };
  release(dupArray);
  release(lowest);
  release(match);
  release(filmd2v);
  release(decimate);
  release(decimate2);
  release(dect);
  release(dect2);
  release(diffMetricsU);
  release(diffMetricsUF);
  release(tArray);
  release(diffMetricsN);

  const size_t intBytes = cycleSize * sizeof(int);
  const size_t u64Bytes = cycleSize * sizeof(uint64_t);
  const size_t dblBytes = cycleSize * sizeof(double);

  dupArray = static_cast<int *>(malloc(intBytes));
  lowest = static_cast<int *>(malloc(intBytes));
  match = static_cast<int *>(malloc(intBytes));
  filmd2v = static_cast<int *>(malloc(intBytes));
  decimate = static_cast<int *>(malloc(intBytes));
  decimate2 = static_cast<int *>(malloc(intBytes));
  dect = static_cast<int *>(malloc(intBytes));
  dect2 = static_cast<int *>(malloc(intBytes));
  diffMetricsU = static_cast<uint64_t *>(malloc(u64Bytes));
  diffMetricsUF = static_cast<uint64_t *>(malloc(u64Bytes));
  tArray = static_cast<uint64_t *>(malloc(u64Bytes));
  diffMetricsN = static_cast<double *>(malloc(dblBytes));

  return dupArray != nullptr && lowest != nullptr && match != nullptr &&
    filmd2v != nullptr && decimate != nullptr && decimate2 != nullptr &&
    diffMetricsU != nullptr && diffMetricsUF != nullptr &&
    diffMetricsN != nullptr && tArray != nullptr &&
    dect != nullptr && dect2 != nullptr;
}

void Cycle::setSize(int _size)
{
  cycleSize = std::max(0, _size);
  allocSpace();
  for (int x = 0; x < cycleSize; ++x)
  {
    dupArray[x] = lowest[x] = decimate[x] = match[x] = decimate2[x] = filmd2v[x] = -20;
    diffMetricsU[x] = diffMetricsUF[x] = tArray[x] = UINT64_MAX;
    diffMetricsN[x] = -20.0;
  }
}

// Copies the state of ob2 into this cycle without reallocating.
//
// All scalar members and flags are copied except sdlim and cycleSize. Array
// contents are copied for the first min(capacity of this, capacity of ob2)
// entries, and length is clamped to that count. The scratch arrays tArray,
// dect and dect2 are not copied.
Cycle& Cycle::operator=(Cycle& ob2)
{
  // Self-assignment would hand identical source and destination buffers to
  // memcpy, which requires non-overlapping objects.
  if (this == &ob2) return *this;

  length = ob2.length;
  maxFrame = ob2.maxFrame;
  frame = ob2.frame;
  frameE = ob2.frameE;
  offE = ob2.offE;
  cycleS = ob2.cycleS;
  cycleE = ob2.cycleE;
  frameSO = ob2.frameSO;
  frameEO = ob2.frameEO;
  type = ob2.type;
  dupsSet = ob2.dupsSet;
  mSet = ob2.mSet;
  lowSet = ob2.lowSet;
  decSet = ob2.decSet;
  dupCount = ob2.dupCount;
  blend = ob2.blend;
  isfilmd2v = ob2.isfilmd2v;

  // Number of entries both objects can hold. cycleSize itself keeps
  // describing this object's allocation: lowering it here (as was done
  // before) would make every later assignment copy fewer entries than the
  // arrays can actually store.
  const int count = std::min(cycleSize, ob2.cycleSize);
  if (length > count) length = count;

  memcpy(dupArray, ob2.dupArray, count * sizeof(int));
  memcpy(lowest, ob2.lowest, count * sizeof(int));
  memcpy(match, ob2.match, count * sizeof(int));
  memcpy(filmd2v, ob2.filmd2v, count * sizeof(int));
  memcpy(decimate, ob2.decimate, count * sizeof(int));
  memcpy(decimate2, ob2.decimate2, count * sizeof(int));
  memcpy(diffMetricsU, ob2.diffMetricsU, count * sizeof(uint64_t));
  memcpy(diffMetricsUF, ob2.diffMetricsUF, count * sizeof(uint64_t));
  memcpy(diffMetricsN, ob2.diffMetricsN, count * sizeof(double));
  return *this;
}
07070100000003000081A4000000000000000000000001671240C900000EC7000000000000000000000000000000000000002B00000000vapoursynth-tivtc-2+2.g7abd4a3/src/Cycle.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#ifndef CYCLE_H
#define CYCLE_H

/*
** This class stores all the individual cycle
** info for TDecimate and provides some useful methods.
**
** For all of this class setting an int to -20 = nothing
** (not set), except for type where -1 = nothing
**
**		  VIDEO TYPES
**	-1 = nothing (not set)
**   0 = film
**   1 = film by ovr
**   2 = video by matches
**   3 = video by metrics
**   4 = video by matches/metrics
**   5 = video by ovr
**
**        Blend Codes
**  -20 = not set
**    0 = no blending
**    1 = cvr - blend video cycle down
**    2 = cvr - video cycle w/ scenechange
**    3 = cvr/vfr - 2 dup cycle workaround
**
*/

#include <stdio.h>
#include <limits.h>
//#include "profUtil.h"
#include "stdint.h"
#include <vector>

// Per-cycle bookkeeping for TDecimate: frame range, metrics, match values
// and the duplicate / decimation markers derived from them.
//
// The object owns all of its arrays (allocated in allocSpace(), released in
// the destructor).
// NOTE(review): a copy assignment operator is declared but no copy
// constructor, so copy-constructing a Cycle would share the array pointers
// between two objects that both free them — confirm no caller does that.
class Cycle
{
private:
  int cycleSize;	// number of entries requested for each array
  bool allocSpace();	// (re)allocates all arrays for cycleSize entries; false if any allocation returned null
  bool checkMatchDup(int mp, int mc);	// true when (previous match, current match) counts as a duplicate

public:
  int sdlim;	// its absolute value is the minimum distance kept between decimated frames; a negative value allows retries with a smaller distance
  int length;		// length of cycle
  int maxFrame;	// nfrms
  int frame;		// first frame of cycle
  int frameE;		// last frame of cycle (frame + length)
  int offE;		// end offset
  int cycleS;		// 0 + start offset
  int cycleE;		// length - offE
  int frameSO;	// frame + cycleS
  int frameEO;	// frame + cycleE
  int type;		// video or film and how
  double *diffMetricsN;			// normalized metrics
  uint64_t *diffMetricsU;	// unnormalized metrics
  uint64_t *diffMetricsUF;	// frame metrics (scenechange detection)
  uint64_t *tArray;			// used as temp storage when sorting
  int *dupArray;	// duplicate marking
  int *lowest;	// sorted list of metrics
  int *decimate;	// position of frames to drop
  int *decimate2;	// needed for some parts of longest string decimation
  int *match;		// frame matches (used for 30p identification)
  int *filmd2v;	// d2v trf flags indicate duplicate
  bool dupsSet;	// dups set
  bool mSet;		// metrics set
  bool lowSet;	// list sorted
  bool decSet;	// decimate array filled in
  bool isfilmd2v;	// d2v indicates duplicate in cycle
  int dupCount;	// tracks # of dups for longest string decimation
  int blend;		// 0, 1 (blending), 2 (mkv), others are hijacked for special handling
  int *dect, *dect2;	// snapshots of decimate / decimate2 used while retrying decimation

  // Positions the cycle at frameIn and recomputes the range members.
  void setFrame(int frameIn);
  // Marks num frames in total for decimation; throws TIVTCError on failure.
  void setDecimateLow(int num);
  // Sorts cycle positions by metric into `lowest`. (The parameter is
  // spelled exludeD here and excludeD in the definition.)
  void setLowest(bool exludeD);
  // Marks duplicates by comparing diffMetricsN against thresh.
  void setDups(double thresh);
  // Marks duplicates from the match sequence, using p / marray for the
  // match preceding this cycle.
  void setDupsMatches(Cycle &p, const std::vector<uint8_t> &marray);
  // Marks num additional frames for decimation; throws TIVTCError on failure.
  void setDecimateLowP(int num);
  // Sets isfilmd2v when any in-range filmd2v entry is 1.
  void setIsFilmD2V();
  // Scene change position when exactly one frame metric exceeds thresh, else -20.
  int sceneDetect(uint64_t thresh);
  // As above, but for cycles of length <= 10 also requires prev and next to
  // have no frame metric above thresh.
  int sceneDetect(Cycle &prev, Cycle &next, uint64_t thresh);
  // Position of the n-th non-decimated frame, or -1.
  int getNonDec(int n);
  // Resets flags, range members and array contents to "not set".
  void clearAll();
  // Debug helpers; their bodies are currently commented out.
  void debugOutput();
  void debugMetrics(int length);

  Cycle(int _size, int _sdlim);
  // Re-allocates the arrays for _size entries and resets their contents.
  void setSize(int _size);
  ~Cycle();
  // Copies state and array contents without reallocating.
  Cycle& operator=(Cycle& ob2);
};

#endif // CYCLE_H
07070100000004000081A4000000000000000000000001671240C9000058C2000000000000000000000000000000000000003200000000vapoursynth-tivtc-2+2.g7abd4a3/src/PluginInit.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <cstdint>
#include <cstdlib>
#include <cstring>

#include <VapourSynth.h>
#include <VSHelper.h>

#include "TFM.h"
#include "TFMPP.h"
#include "TDecimate.h"


// Init callback of the TFM filter: hands the filter's video info (one
// output) to VapourSynth for the node being created.
static void VS_CC tfmInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(in);
    static_cast<void>(out);
    static_cast<void>(core);

    auto *filter = static_cast<TFM *>(*instanceData);
    vsapi->setVideoInfo(filter->vi, 1, node);
}


// GetFrame callback of the TFM filter: forwards the request to TFM::GetFrame.
static const VSFrameRef *VS_CC tfmGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(frameData);
    static_cast<void>(vsapi);

    auto *filter = static_cast<TFM *>(*instanceData);
    return filter->GetFrame(n, activationReason, frameCtx, core);
}


// Free callback of the TFM filter: destroys the TFM instance.
static void VS_CC tfmFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(core);
    static_cast<void>(vsapi);

    delete static_cast<TFM *>(instanceData);
}


// Init callback of the TFMPP filter: hands the filter's video info (one
// output) to VapourSynth for the node being created.
static void VS_CC tfmppInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(in);
    static_cast<void>(out);
    static_cast<void>(core);

    auto *filter = static_cast<TFMPP *>(*instanceData);
    vsapi->setVideoInfo(filter->vi, 1, node);
}


// GetFrame callback of the TFMPP filter: forwards the request to
// TFMPP::GetFrame.
static const VSFrameRef *VS_CC tfmppGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(frameData);
    static_cast<void>(vsapi);

    auto *filter = static_cast<TFMPP *>(*instanceData);
    return filter->GetFrame(n, activationReason, frameCtx, core);
}


// Free callback of the TFMPP filter: destroys the TFMPP instance.
static void VS_CC tfmppFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    static_cast<void>(core);
    static_cast<void>(vsapi);

    delete static_cast<TFMPP *>(instanceData);
}


// Selects which filter's display text tivtcDisplayFunc overlays: the frame
// property read and the name used in its error message depend on it.
enum DisplayFilters {
    DisplayTFM,      // use PROP_TFMDisplay
    DisplayTDecimate // use PROP_TDecimateDisplay
};

// Per-frame callback that draws the filter's display text onto the clip.
//
// Reads the display string from the properties of frame "f" in `in`, runs
// text.Text on the node passed through userData with that string, and stores
// the resulting node in `out` under "val". If text.Text reports an error it
// is forwarded through `out` instead.
template <DisplayFilters filter>
static void VS_CC tivtcDisplayFunc(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    VSNodeRef *sourceNode = static_cast<VSNodeRef *>(userData);

    const char *propName = (filter == DisplayTFM) ? PROP_TFMDisplay : PROP_TDecimateDisplay;

    // Fetch the text while the frame is alive; propSetData below stores it
    // in the argument map before the frame is released.
    const VSFrameRef *frame = vsapi->propGetFrame(in, "f", 0, nullptr);
    const VSMap *frameProps = vsapi->getFramePropsRO(frame);
    const char *message = vsapi->propGetData(frameProps, propName, 0, nullptr);
    const int messageSize = vsapi->propGetDataSize(frameProps, propName, 0, nullptr);

    VSMap *args = vsapi->createMap();
    // sourceNode comes from userData and is deliberately not freed in this
    // function.
    vsapi->propSetNode(args, "clip", sourceNode, paReplace);
    vsapi->propSetData(args, "text", message, messageSize, paReplace);
    vsapi->freeFrame(frame);

    VSPlugin *textPlugin = vsapi->getPluginById("com.vapoursynth.text", core);
    VSMap *result = vsapi->invoke(textPlugin, "Text", args);
    vsapi->freeMap(args);

    if (vsapi->getError(result)) {
        char error[512] = { 0 };
        snprintf(error, sizeof(error), "%s: failed to invoke text.Text: %s", filter == DisplayTFM ? "TFM" : "TDecimate", vsapi->getError(result));
        vsapi->freeMap(result);
        vsapi->setError(out, error);
        return;
    }

    VSNodeRef *textNode = vsapi->propGetNode(result, "clip", 0, nullptr);
    vsapi->freeMap(result);
    vsapi->propSetNode(out, "val", textNode, paReplace);
    vsapi->freeNode(textNode);
}


// Creation function of the TFM filter.
//
// Reads the arguments from `in` (each optional one falls back to the default
// listed below), builds the TFM node and stores the final node in `out`
// under "clip". Depending on the arguments the TFM node is wrapped further:
//   PP > 4   : std.Cache is placed after TFM,
//   PP > 1   : a TFMPP node is appended (optionally using "clip2"),
//   display  : std.FrameEval + text.Text overlay the TFMDisplay property.
// Any failure is reported through the error of `out`.
static void VS_CC tfmCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    // Readers for optional arguments: each returns `fallback` (or "" for
    // data) when the key cannot be read from `in`.
    auto optInt = [in, vsapi](const char *key, int fallback) {
        int missing;
        const int value = int64ToIntS(vsapi->propGetInt(in, key, 0, &missing));
        return missing ? fallback : value;
    };
    auto optBool = [in, vsapi](const char *key, bool fallback) {
        int missing;
        const bool value = !!vsapi->propGetInt(in, key, 0, &missing);
        return missing ? fallback : value;
    };
    auto optFloat = [in, vsapi](const char *key, double fallback) {
        int missing;
        const double value = vsapi->propGetFloat(in, key, 0, &missing);
        return missing ? fallback : value;
    };
    auto optData = [in, vsapi](const char *key) -> const char * {
        int missing;
        const char *value = vsapi->propGetData(in, key, 0, &missing);
        return missing ? "" : value;
    };

    int order = optInt("order", -1);
    int field = optInt("field", -1);
    int mode = optInt("mode", 1);
    int PP = optInt("PP", 6);
    const char *ovr = optData("ovr");
    const char *input = optData("input");
    const char *output = optData("output");
    const char *outputC = optData("outputC");
    bool debug = optBool("debug", false); /// not used for anything at the moment. maybe use logMessage ?
    bool display = optBool("display", false);
    int slow = optInt("slow", 1);
    bool mChroma = optBool("mChroma", true);
    int cNum = optInt("cNum", 15);
    int cthresh = optInt("cthresh", 9);
    int MI = optInt("MI", 80);
    bool chroma = optBool("chroma", false);
    int blockx = optInt("blockx", 16);
    int blocky = optInt("blocky", 16);
    int y0 = optInt("y0", 0);
    int y1 = optInt("y1", 0);
    int mthresh = optInt("mthresh", 5);
    const char *d2v = optData("d2v");
    int ovrDefault = optInt("ovrDefault", 0);
    int flags = optInt("flags", 4);
    double scthresh = optFloat("scthresh", 12.0);
    int micout = optInt("micout", 0);
    int micmatching = optInt("micmatching", 1);
    const char *trimIn = optData("trimIn");
    bool hint = optBool("hint", true);
    int metric = optInt("metric", 0);
    bool batch = optBool("batch", false);
    bool ubsco = optBool("ubsco", true);
    bool mmsco = optBool("mmsco", true);
    int opt = optInt("opt", 4);


    VSNodeRef *clip = vsapi->propGetNode(in, "clip", 0, nullptr);

    TFM *tfm_data;

    try {
        tfm_data = new TFM(clip, order, field, mode, PP, ovr, input, output, outputC, debug, display, slow, mChroma, cNum, cthresh,
                       MI, chroma, blockx, blocky, y0, y1, d2v, ovrDefault, flags, scthresh, micout, micmatching, trimIn, hint,
                       metric, batch, ubsco, mmsco, opt, vsapi, core);
    } catch (const TIVTCError& e) {
        vsapi->setError(out, e.what());

        vsapi->freeNode(clip);

        return;
    }

    int filter_mode = fmParallelRequests; /// It's possible fmParallel could be used in some situations. Study the matter.
    int filter_flags = 0;
    if (mode == 7) {
        // mode 7 requires linear access to function correctly.
        filter_mode = fmSerial;
        filter_flags = nfMakeLinear;
    }
    vsapi->createFilter(in, out, "TFM", tfmInit, tfmGetFrame, tfmFree, filter_mode, filter_flags, tfm_data, core);

    if (vsapi->getError(out))
        return;


    if (PP > 4) {
        // Put std.Cache after the TFM node and make its output the new "clip".
        VSMap *params = vsapi->createMap();
        VSNodeRef *node = vsapi->propGetNode(out, "clip", 0, nullptr);
        vsapi->propSetNode(params, "clip", node, paReplace);
        vsapi->freeNode(node);
        VSPlugin *std_plugin = vsapi->getPluginById("com.vapoursynth.std", core);
        VSMap *ret = vsapi->invoke(std_plugin, "Cache", params);
        vsapi->freeMap(params);
        if (vsapi->getError(ret)) {
            char error[512] = { 0 };
            snprintf(error, 512, "TFM: failed to invoke std.Cache: %s", vsapi->getError(ret));
            vsapi->freeMap(ret);
            vsapi->setError(out, error);
            return;
        }
        node = vsapi->propGetNode(ret, "clip", 0, nullptr);
        vsapi->freeMap(ret);
        vsapi->propSetNode(out, "clip", node, paReplace);
        vsapi->freeNode(node);
    }

    if (PP > 1) {
        // "clip2" is optional; the read status is not inspected here.
        int clip2Missing;
        VSNodeRef *clip2 = vsapi->propGetNode(in, "clip2", 0, &clip2Missing);

        VSNodeRef *node = vsapi->propGetNode(out, "clip", 0, nullptr);

        TFMPP *tfmpp_data;

        try {
            tfmpp_data = new TFMPP(node, PP, mthresh, ovr, display, clip2, hint, opt, vsapi, core);
        } catch (const TIVTCError& e) {
            vsapi->setError(out, e.what());

            vsapi->freeNode(node);
            vsapi->freeNode(clip2);

            return;
        }

        // createFilter uses paAppend when adding the node to the "out" map, so clear the existing node first.
        vsapi->propDeleteKey(out, "clip");

        vsapi->createFilter(in, out, "TFMPP", tfmppInit, tfmppGetFrame, tfmppFree, fmParallelRequests, 0, tfmpp_data, core);

        // Same guard as after the TFM createFilter call: the display step
        // below reads "clip" from `out` without an error pointer, which must
        // not happen on a map that carries an error.
        if (vsapi->getError(out))
            return;
    }

    if (display) {
        // text.FrameProps won't print the TFMDisplay property because it is too long,
        // so we use text.Text with std.FrameEval instead.
        VSMap *params = vsapi->createMap();
        VSNodeRef *node = vsapi->propGetNode(out, "clip", 0, nullptr);
        vsapi->propSetNode(params, "clip", node, paReplace);
        vsapi->propSetNode(params, "prop_src", node, paReplace);
        // The function object gets its own reference to the node, released
        // through the freeNode callback passed to createFunc.
        VSFuncRef *displayFuncRef = vsapi->createFunc(tivtcDisplayFunc<DisplayTFM>, vsapi->cloneNodeRef(node), (VSFreeFuncData)vsapi->freeNode, core, vsapi);
        vsapi->freeNode(node);
        vsapi->propSetFunc(params, "eval", displayFuncRef, paReplace);
        vsapi->freeFunc(displayFuncRef);
        VSPlugin *std_plugin = vsapi->getPluginById("com.vapoursynth.std", core);
        VSMap *ret = vsapi->invoke(std_plugin, "FrameEval", params);
        vsapi->freeMap(params);
        if (vsapi->getError(ret)) {
            char error[512] = { 0 };
            snprintf(error, 512, "TFM: failed to invoke std.FrameEval: %s", vsapi->getError(ret));
            vsapi->freeMap(ret);
            vsapi->setError(out, error);
            return;
        }
        node = vsapi->propGetNode(ret, "clip", 0, nullptr);
        vsapi->freeMap(ret);
        vsapi->propSetNode(out, "clip", node, paReplace);
        vsapi->freeNode(node);
    }
}


// Filter init callback for TDecimate: hands the instance's `vi` member to the
// core as the video info of the filter's single output.
static void VS_CC tdecimateInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
    // These callback arguments are not needed here.
    (void)in;
    (void)out;
    (void)core;

    TDecimate *filter = static_cast<TDecimate *>(*instanceData);
    vsapi->setVideoInfo(&filter->vi, 1, node);
}


// Frame callback for TDecimate: forwards the request to TDecimate::GetFrame
// on the instance stored in *instanceData and returns whatever it returns.
static const VSFrameRef *VS_CC tdecimateGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
    (void)vsapi; // not used by this wrapper

    TDecimate *filter = static_cast<TDecimate *>(*instanceData);
    const VSFrameRef *frame = filter->GetFrame(n, activationReason, frameData, frameCtx, core);
    return frame;
}


// Free callback for TDecimate: destroys the TDecimate instance that was handed
// to createFilter as instance data.
static void VS_CC tdecimateFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
    // Unused callback arguments.
    (void)core;
    (void)vsapi;

    delete static_cast<TDecimate *>(instanceData);
}


// Creation callback for the "TDecimate" function.
//
// Reads every argument from "in" (substituting defaults for the optional ones),
// constructs the TDecimate instance and registers the filter. When "display"
// is enabled the resulting clip is additionally wrapped in std.FrameEval.
// On failure an error is set on "out".
static void VS_CC tdecimateCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
    (void)userData;

    // Readers for optional arguments: each one returns the fallback when the
    // property cannot be read from "in".
    auto readInt = [in, vsapi](const char *key, int fallback) {
        int failed;
        int value = int64ToIntS(vsapi->propGetInt(in, key, 0, &failed));
        return failed ? fallback : value;
    };
    auto readBool = [in, vsapi](const char *key, bool fallback) {
        int failed;
        bool value = !!vsapi->propGetInt(in, key, 0, &failed);
        return failed ? fallback : value;
    };
    auto readFloat = [in, vsapi](const char *key, double fallback) {
        int failed;
        double value = vsapi->propGetFloat(in, key, 0, &failed);
        return failed ? fallback : value;
    };
    auto readData = [in, vsapi](const char *key) {
        int failed;
        const char *value = vsapi->propGetData(in, key, 0, &failed);
        return failed ? "" : value;
    };

    VSNodeRef *clip = vsapi->propGetNode(in, "clip", 0, nullptr); /// move lower if possible

    int mode = readInt("mode", 0);
    int cycleR = readInt("cycleR", 1);
    int cycle = readInt("cycle", 5);
    double rate = readFloat("rate", 23.976);

    // Chroma is forced off for gray clips regardless of the argument.
    bool chroma = readBool("chroma", true);
    const VSVideoInfo *clipInfo = vsapi->getVideoInfo(clip);
    if (clipInfo->format && clipInfo->format->colorFamily == cmGray)
        chroma = false;

    // The threshold defaults depend on the mode and on whether chroma is used.
    double dupThresh = readFloat("dupThresh", mode == 7 ? (chroma ? 0.4 : 0.5)
                                                        : (chroma ? 1.1 : 1.4));
    double vidThresh = readFloat("vidThresh", mode == 7 ? (chroma ? 3.5 : 4.0)
                                                        : (chroma ? 1.1 : 1.4));
    double sceneThresh = readFloat("sceneThresh", 15);

    int hybrid = readInt("hybrid", 0);
    int vidDetect = readInt("vidDetect", 3);

    // Both conCycle defaults depend on vidDetect.
    int conCycle = readInt("conCycle", vidDetect >= 3 ? 1 : 2);
    int conCycleTP = readInt("conCycleTP", vidDetect >= 3 ? 1 : 2);

    const char *ovr = readData("ovr");
    const char *output = readData("output");
    const char *input = readData("input");
    const char *tfmIn = readData("tfmIn");
    const char *mkvOut = readData("mkvOut");

    int nt = readInt("nt", 0);
    int blockx = readInt("blockx", 32);
    int blocky = readInt("blocky", 32);

    bool debug = readBool("debug", false);
    bool display = readBool("display", false);

    int vfrDec = readInt("vfrDec", 1);

    bool batch = readBool("batch", false);
    bool tcfv1 = readBool("tcfv1", true);
    bool se = readBool("se", false);
    bool exPP = readBool("exPP", false);

    int maxndl = readInt("maxndl", -200);

    bool m2PA = readBool("m2PA", false);
    bool denoise = readBool("denoise", false);
    bool noblend = readBool("noblend", true);
    bool ssd = readBool("ssd", false);
    bool hint = readBool("hint", true);

    int clip2Missing;
    VSNodeRef *clip2 = vsapi->propGetNode(in, "clip2", 0, &clip2Missing);
    if (clip2Missing)
        clip2 = vsapi->cloneNodeRef(clip); // simplifies the code in the getframe functions

    int sdlim = readInt("sdlim", 0);
    int opt = readInt("opt", 4);

    const char *orgOut = readData("orgOut");


    TDecimate *tdecimate_data;

    try {
        tdecimate_data = new TDecimate(clip, mode, cycleR, cycle, rate, dupThresh, vidThresh, sceneThresh, hybrid, vidDetect, conCycle, conCycleTP, ovr, output, input, tfmIn, mkvOut, nt, blockx, blocky, debug, display, vfrDec, batch, tcfv1, se, chroma, exPP, maxndl, m2PA, denoise, noblend, ssd, hint, clip2, sdlim, opt, orgOut, vsapi, core);
    } catch (const TIVTCError& e) {
        // Construction failed: report the error and release both node references here.
        vsapi->setError(out, e.what());

        vsapi->freeNode(clip);
        vsapi->freeNode(clip2);

        return;
    }

    // Filter mode and flags, indexed by "mode".
    const int modeToFilterMode[8] = {
        fmParallelRequests,
        fmParallelRequests,
        fmUnordered, // Either fmUnordered or fmParallelRequests. I figured out which one but I didn't write it down and forgot.
        fmSerial,
        fmParallel,
        fmParallel,
        fmParallel,
        fmUnordered
    };
    const int modeToFilterFlags[8] = {
        0,
        0,
        0,
        nfMakeLinear,
        0,
        0,
        0,
        0
    };

    vsapi->createFilter(in, out, "TDecimate", tdecimateInit, tdecimateGetFrame, tdecimateFree, modeToFilterMode[mode], modeToFilterFlags[mode], tdecimate_data, core);

    if (vsapi->getError(out))
        return;

    if (!display)
        return;

    // text.FrameProps won't print the TDecimateDisplay property because it is too long,
    // so we use text.Text with std.FrameEval instead.
    VSMap *evalArgs = vsapi->createMap();
    VSNodeRef *filtered = vsapi->propGetNode(out, "clip", 0, nullptr);
    vsapi->propSetNode(evalArgs, "clip", filtered, paReplace);
    vsapi->propSetNode(evalArgs, "prop_src", filtered, paReplace);
    // The callback receives its own node reference, released through freeNode.
    VSFuncRef *displayCallback = vsapi->createFunc(tivtcDisplayFunc<DisplayTDecimate>, vsapi->cloneNodeRef(filtered), (VSFreeFuncData)vsapi->freeNode, core, vsapi);
    vsapi->freeNode(filtered);
    vsapi->propSetFunc(evalArgs, "eval", displayCallback, paReplace);
    vsapi->freeFunc(displayCallback);

    VSPlugin *stdPlugin = vsapi->getPluginById("com.vapoursynth.std", core);
    VSMap *evalResult = vsapi->invoke(stdPlugin, "FrameEval", evalArgs);
    vsapi->freeMap(evalArgs);

    if (vsapi->getError(evalResult)) {
        char message[512] = { 0 };
        snprintf(message, sizeof(message), "TDecimate: failed to invoke std.FrameEval: %s", vsapi->getError(evalResult));
        vsapi->freeMap(evalResult);
        vsapi->setError(out, message);
        return;
    }

    // Replace the filter's clip in "out" with the FrameEval-wrapped one.
    VSNodeRef *wrapped = vsapi->propGetNode(evalResult, "clip", 0, nullptr);
    vsapi->freeMap(evalResult);
    vsapi->propSetNode(out, "clip", wrapped, paReplace);
    vsapi->freeNode(wrapped);
}


// Plugin entry point: configures the plugin under the identifier
// "com.nodame.tivtc" (namespace "tivtc") and registers its two functions,
// TFM and TDecimate.
//
// Each registerFunc call takes the function name, a signature string made of
// "name:type[:opt];" entries, the creation callback, user data (unused, so
// nullptr) and the plugin handle. Only "clip" is mandatory in both signatures;
// every other argument is marked ":opt".
VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
    // NOTE(review): (3 << 16) | 5 presumably encodes API version 3.5 and the following 1 the read-only flag - confirm against VapourSynth.h.
    configFunc("com.nodame.tivtc", "tivtc", "Field matching and decimation", (3 << 16) | 5, 1, plugin);

    // TFM, created by tfmCreate.
    registerFunc("TFM",
                 "clip:clip;"
                 "order:int:opt;"
                 "field:int:opt;"
                 "mode:int:opt;"
                 "PP:int:opt;"
                 "ovr:data:opt;"
                 "input:data:opt;"
                 "output:data:opt;"
                 "outputC:data:opt;"
                 "debug:int:opt;"
                 "display:int:opt;"
                 "slow:int:opt;"
                 "mChroma:int:opt;"
                 "cNum:int:opt;"
                 "cthresh:int:opt;"
                 "MI:int:opt;"
                 "chroma:int:opt;"
                 "blockx:int:opt;"
                 "blocky:int:opt;"
                 "y0:int:opt;"
                 "y1:int:opt;"
                 "mthresh:int:opt;"
                 "clip2:clip:opt;"
                 "d2v:data:opt;"
                 "ovrDefault:int:opt;"
                 "flags:int:opt;"
                 "scthresh:float:opt;"
                 "micout:int:opt;"
                 "micmatching:int:opt;"
                 "trimIn:data:opt;"
                 "hint:int:opt;"
                 "metric:int:opt;"
                 "batch:int:opt;"
                 "ubsco:int:opt;"
                 "mmsco:int:opt;"
                 "opt:int:opt;"
                 , tfmCreate, nullptr, plugin);

    // TDecimate, created by tdecimateCreate. The argument names match the
    // keys that tdecimateCreate reads from its input map.
    registerFunc("TDecimate",
                 "clip:clip;"
                 "mode:int:opt;"
                 "cycleR:int:opt;"
                 "cycle:int:opt;"
                 "rate:float:opt;"
                 "dupThresh:float:opt;"
                 "vidThresh:float:opt;"
                 "sceneThresh:float:opt;"
                 "hybrid:int:opt;"
                 "vidDetect:int:opt;"
                 "conCycle:int:opt;"
                 "conCycleTP:int:opt;"
                 "ovr:data:opt;"
                 "output:data:opt;"
                 "input:data:opt;"
                 "tfmIn:data:opt;"
                 "mkvOut:data:opt;"
                 "nt:int:opt;"
                 "blockx:int:opt;"
                 "blocky:int:opt;"
                 "debug:int:opt;"
                 "display:int:opt;"
                 "vfrDec:int:opt;"
                 "batch:int:opt;"
                 "tcfv1:int:opt;"
                 "se:int:opt;"
                 "chroma:int:opt;"
                 "exPP:int:opt;"
                 "maxndl:int:opt;"
                 "m2PA:int:opt;"
                 "denoise:int:opt;"
                 "noblend:int:opt;"
                 "ssd:int:opt;"
                 "hint:int:opt;"
                 "clip2:clip:opt;"
                 "sdlim:int:opt;"
                 "opt:int:opt;"
                 "orgOut:data:opt;"
                 , tdecimateCreate, nullptr, plugin);
}
07070100000005000081A4000000000000000000000001671240C90000BFED000000000000000000000000000000000000003200000000vapoursynth-tivtc-2+2.g7abd4a3/src/TCommonASM.cpp/*
**   Helper methods for TIVTC and TDeint
**
**
**   Copyright (C) 2004-2007 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TCommonASM.h"
#include "emmintrin.h"
#include "smmintrin.h" // SSE4
#include <algorithm>

// SSE2 version of absDiff: writes 1 to dstp where the absolute difference of
// the two sources is below the threshold and 0 otherwise.
//
// Bytes at even offsets are compared against mthresh1, bytes at odd offsets
// against mthresh2 (for non-YUY2 both thresholds are the same).
// Rows are processed in 16-byte steps using aligned loads and stores, so the
// row pointers must be 16-byte aligned and a whole number of 16-byte blocks
// covering `width` is read and written on every row.
void absDiff_SSE2(const uint8_t *srcp1, const uint8_t *srcp2,
  uint8_t *dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh1, int mthresh2)
{
  // diff >= thresh  <=>  saturating_add(diff, 255 - thresh) == 255
  const int bias_even = std::max(0, std::min(255, 255 - mthresh1));
  const int bias_odd = std::max(0, std::min(255, 255 - mthresh2));

  const __m128i one = _mm_set1_epi8(1);
  const __m128i all_ff = _mm_set1_epi8(-1);
  // Low byte of every 16-bit lane carries the even-offset bias, high byte the odd-offset one.
  const __m128i bias = _mm_set1_epi16(static_cast<short>((bias_odd << 8) | bias_even));

  for (int row = 0; row < height; ++row)
  {
    for (int col = 0; col < width; col += 16)
    {
      const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp1 + col));
      const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp2 + col));
      // One of the two saturating differences is zero, so OR yields |a - b|.
      const __m128i diff = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
      const __m128i at_or_above = _mm_cmpeq_epi8(_mm_adds_epu8(diff, bias), all_ff);
      // Keep a 1 only where the difference stayed below the threshold.
      const __m128i below = _mm_andnot_si128(at_or_above, one);
      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + col), below);
    }
    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
    dstp += dst_pitch;
  }
}

// Fills the target byte buffer with 1 where the absolute difference of the two
// sources is less than the threshold, 0 otherwise.
//
// Bytes at even x are compared against mthresh1 and bytes at odd x against
// mthresh2 (for non-YUY2 mthresh1 and mthresh2 are the same).
// dstp is a simple 1-byte format buffer (no high bit depth content).
// Pitches are in bytes; exactly `width` bytes are read and written per row.
void absDiff_c(const uint8_t* srcp1, const uint8_t* srcp2,
  uint8_t* dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh1, int mthresh2)
{
  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; x += 2)
    {
      dstp[x] = abs(srcp1[x] - srcp2[x]) < mthresh1 ? 1 : 0;

      // next planar pixel or YUY2 chroma; guarded so that an odd width
      // does not touch the byte just past the end of the row
      if (x + 1 < width)
        dstp[x + 1] = abs(srcp1[x + 1] - srcp2[x + 1]) < mthresh2 ? 1 : 0;
    }
    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
    dstp += dst_pitch;
  }
}

// 16-bit variant of absDiff_c: the sources hold uint16_t samples, the target
// is a plain byte buffer that receives 1 where |src1 - src2| < mthresh and
// 0 otherwise. `width` counts pixels; all pitches are in bytes.
void absDiff_uint16_c(const uint8_t* srcp1, const uint8_t* srcp2,
  uint8_t* dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh)
{
  for (int row = 0; row < height; ++row)
  {
    const uint16_t* first = reinterpret_cast<const uint16_t*>(srcp1);
    const uint16_t* second = reinterpret_cast<const uint16_t*>(srcp2);

    for (int col = 0; col < width; ++col)
      dstp[col] = abs(first[col] - second[col]) < mthresh ? 1 : 0;

    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
    dstp += dst_pitch;
  }
}

// Stores the per-sample absolute difference |prv - nxt| into dstp using SSE2.
// Samples are bytes when sizeof(pixel_t) == 1 and 16-bit words otherwise.
//
// rowsize is given in bytes. When it is a multiple of 16 every row is handled
// in full 16-byte blocks; otherwise the last chunk of each row is 8 bytes
// instead of 16. The 16-byte blocks use aligned loads and stores.
template<typename pixel_t>
void buildABSDiffMask_SSE2(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int rowsize,
  int height)
{
  // One of the two saturating differences is always zero, so OR-ing them
  // gives the absolute difference.
  const auto abs_diff = [](__m128i a, __m128i b) -> __m128i {
    if constexpr (sizeof(pixel_t) == 1)
      return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    else
      return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
  };

  const bool half_tail = (rowsize & 15) != 0; // not exact mod16
  const int wide_end = half_tail ? rowsize - 8 : rowsize;

  while (height--) {
    int x = 0;
    for (; x < wide_end; x += 16)
    {
      const __m128i prev_block = _mm_load_si128(reinterpret_cast<const __m128i*>(prvp + x));
      const __m128i next_block = _mm_load_si128(reinterpret_cast<const __m128i*>(nxtp + x));
      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + x), abs_diff(prev_block, next_block));
    }
    if (half_tail)
    {
      // remaining half block
      const __m128i prev_half = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(prvp + x));
      const __m128i next_half = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(nxtp + x));
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp + x), abs_diff(prev_half, next_half));
    }
    prvp += prv_pitch;
    nxtp += nxt_pitch;
    dstp += dst_pitch;
  }
}


// Plain C++ version of the absolute-difference mask: for every pixel writes
// |prv - nxt| into the destination, which has the same sample type (pixel_t)
// as the sources. `width` counts pixels; the pitches are in bytes.
// Nothing is written when width is zero or negative.
template<typename pixel_t>
void buildABSDiffMask_c(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height)
{
  if (width <= 0)
    return;

  for (int row = 0; row < height; ++row)
  {
    const pixel_t* prev_row = reinterpret_cast<const pixel_t*>(prvp);
    const pixel_t* next_row = reinterpret_cast<const pixel_t*>(nxtp);
    pixel_t* out_row = reinterpret_cast<pixel_t*>(dstp);

    for (int col = 0; col < width; ++col)
      out_row[col] = abs(prev_row[col] - next_row[col]);

    prvp += prv_pitch;
    nxtp += nxt_pitch;
    dstp += dst_pitch;
  }
}

// Dispatcher for the absolute-difference mask.
//
// With SSE2 available and at least 8 pixels per row, the largest multiple of
// 8 bytes of each row goes through buildABSDiffMask_SSE2 and any remaining
// pixels through buildABSDiffMask_c; otherwise the C version handles the
// whole row. `width` is in pixels, the pitches are in bytes.
template<typename pixel_t>
void do_buildABSDiffMask(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* tbuffer,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height, const CPUFeatures *cpuFlags)
{
  const bool use_sse2 = cpuFlags->sse2 && width >= 8;
  if (!use_sse2)
  {
    buildABSDiffMask_c<pixel_t>(prvp, nxtp, tbuffer, prv_pitch, nxt_pitch, tpitch, width, height);
    return;
  }

  const int row_bytes = width * sizeof(pixel_t);
  const int simd_bytes = row_bytes / 8 * 8;
  const int simd_pixels = simd_bytes / sizeof(pixel_t);

  // SSE2 is not YUY2 chroma-ignore template, it's quicker if not skipping each YUY2 chroma
  buildABSDiffMask_SSE2<pixel_t>(prvp, nxtp, tbuffer, prv_pitch, nxt_pitch, tpitch, simd_bytes, height);

  // Leftover pixels to the right of the SIMD part (the callee returns at once when there are none).
  buildABSDiffMask_c<pixel_t>(
    prvp + simd_bytes,
    nxtp + simd_bytes,
    tbuffer + simd_bytes,
    prv_pitch, nxt_pitch, tpitch,
    width - simd_pixels,
    height);
}
// instantiate
template void do_buildABSDiffMask<uint8_t>(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* tbuffer,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height, const CPUFeatures *cpuFlags);
template void do_buildABSDiffMask<uint16_t>(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* tbuffer,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height, const CPUFeatures *cpuFlags);


template<typename pixel_t>
void do_buildABSDiffMask2(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* dstp,
  int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpuFlags, int bits_per_pixel)
{
  if (cpuFlags->sse2 && width >= 8) // yes, width and not row_size
  {
    int mod8Width = width / 8 * 8;
    if constexpr(sizeof(pixel_t) == 8)
      buildABSDiffMask2_uint8_SSE2(prvp, nxtp, dstp, prv_pitch, nxt_pitch, dst_pitch, mod8Width, height);
    else
      buildABSDiffMask2_uint16_SSE2(prvp, nxtp, dstp, prv_pitch, nxt_pitch, dst_pitch, mod8Width, height, bits_per_pixel);

    buildABSDiffMask2_c<pixel_t>(
        prvp + mod8Width * sizeof(pixel_t), 
        nxtp + mod8Width * sizeof(pixel_t),
        dstp + mod8Width, // dstp is really 8 bits
        prv_pitch, nxt_pitch, dst_pitch, width - mod8Width, height, bits_per_pixel);
  }
  else {
      buildABSDiffMask2_c<pixel_t>(prvp, nxtp, dstp, prv_pitch, nxt_pitch, dst_pitch, width, height, bits_per_pixel);
  }
}
// instantiate
template void do_buildABSDiffMask2<uint8_t>(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* dstp,
  int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpuFlags, int bits_per_pixel);
template void do_buildABSDiffMask2<uint16_t>(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* dstp,
  int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpuFlags, int bits_per_pixel);

// Finally, this is common for TFM and TDeint, planar and YUY2 (luma, luma+chroma).
// This C code replaces several thousand lines of copy-pasted original inline asm
// (plus handles 10+ bits).

// distance of neighboring pixels:
// 1 for planar any
// 2 for YUY2 luma
// 4 for YUY2 chroma
//
// Classifies the pixel at column x by adding to dstp[x]:
//   +1 when the centre value exceeds Const3 and at least one of its 8
//      neighbours (rows dpp/dp/dpn, columns x-DIST/x/x+DIST) does too,
//   +2 or +4 on top of that, depending on where values above Const19 are found
//      in the rows above and below (see the steps below).
//
// dppp, dpp, dp, dpn and dpnn are five consecutive rows of the difference
// buffer, with dp being the row that contains the analysed pixel.
// x, y, Width and Height are taken by reference but are only read here.
template<typename pixel_t, int bits_per_pixel, int DIST>
static AVS_FORCEINLINE void AnalyzeOnePixel(uint8_t* dstp,
  const pixel_t* dppp, const pixel_t* dpp,
  const pixel_t* dp,
  const pixel_t* dpn, const pixel_t* dpnn,
  int& x, int& y, int& Width, int& Height)
{
  // The 8-bit thresholds 3 and 19, scaled up to the actual bit depth.
  constexpr int Const3 = 3 << (bits_per_pixel - 8);
  constexpr int Const19 = 19 << (bits_per_pixel - 8);

  // Centre value not above the low threshold: nothing to mark.
  if (dp[x] <= Const3)
    return;

  // None of the 8 neighbours is above the low threshold: nothing to mark.
  if (dp[x - DIST] <= Const3 && dp[x + DIST] <= Const3 &&
    dpp[x - DIST] <= Const3 && dpp[x] <= Const3 && dpp[x + DIST] <= Const3 &&
    dpn[x - DIST] <= Const3 && dpn[x] <= Const3 && dpn[x + DIST] <= Const3)
    return;

  dstp[x]++;

  // The remaining steps only apply when the centre is above the high threshold.
  if (dp[x] <= Const19)
    return;

  int edi = 0;   // running count of neighbours above Const19
  int lower = 0; // set when a value above Const19 is found in the row below (dpn)
  int upper = 0; // set when a value above Const19 is found in the row above (dpp)

  if (dpp[x - DIST] > Const19) edi++;
  if (dpp[x] > Const19) edi++;
  if (dpp[x + DIST] > Const19) edi++;

  if (edi != 0) upper = 1;

  if (dp[x - DIST] > Const19) edi++;
  if (dp[x + DIST] > Const19) edi++;

  // Count before the lower row is added; used below to detect lower-row hits.
  int esi = edi;

  if (dpn[x - DIST] > Const19) edi++;
  if (dpn[x] > Const19) edi++;
  if (dpn[x + DIST] > Const19) edi++;

  // Two or fewer neighbours above the high threshold: keep only the +1.
  if (edi <= 2)
    return;

  int count = edi;
  if (count != esi) {
    // At least one of the three lower-row neighbours exceeded Const19.
    lower = 1;
    if (upper != 0) {
      // Hits both directly above and directly below.
      dstp[x] += 2;
      return;
    }
  }

  int lower2 = 0; // set when a value above Const19 is found two rows below (dpnn)
  int upper2 = 0; // set when a value above Const19 is found two rows above (dppp)

  // Horizontal scan window [startx, stopx), stepped by DIST.
  int startx, stopx;

  constexpr bool YUY2_chroma = (DIST == 4);

  if (YUY2_chroma) {
    // Window of 4 * 4 on each side, clamped to the row; the left clamp is the
    // first chroma position with the same (x & 2) as x.
    const int firstchroma = (x & 2) + 1;
    startx = x - 4 * 4 < firstchroma ? firstchroma : x - 4 * 4;
    stopx = x + 4 * 4 + 2 > Width ? Width : x + 4 * 4 + 2;
  }
  else {
    // Window of 4 * DIST on each side, clamped to [0, Width).
    startx = x < 4 * DIST ? 0 : x - 4 * DIST;
    stopx = x + 4 * DIST + DIST > Width ? Width : x + 4 * DIST + DIST;
  }

  // The row two above is not scanned when y == 2.
  if (y != 2) {
    for (esi = startx; esi < stopx; esi += DIST) {
      if (dppp[esi] > Const19) {
        upper2 = 1;
        break;
      }
    }
  }

  // Widen the search in the rows directly above and below to the whole window.
  for (esi = startx; esi < stopx; esi += DIST)
  {
    if (dpp[esi] > Const19)
      upper = 1;
    if (dpn[esi] > Const19)
      lower = 1;
    if (upper != 0 && lower != 0)
      break;
  }

  // The row two below is not scanned when y == Height - 4.
  if (y != Height - 4) {
    for (esi = startx; esi < stopx; esi += DIST)
    {
      if (dpnn[esi] > Const19) {
        lower2 = 1;
        break;
      }
    }
  }

  // Final decision: +2 when hits exist on both sides of the centre row (or in
  // both rows on the same side), otherwise +4 when more than 4 neighbours
  // were above the high threshold.
  if (upper == 0) {
    if (lower == 0 || lower2 == 0) {
      if (count > 4)
        dstp[x] += 4;
    }
    else {
      dstp[x] += 2;
    }
  }
  else {
    if (lower != 0 || upper2 != 0) {
      dstp[x] += 2;
    }
    else {
      if (count > 4)
        dstp[x] += 4;
    }
  }
}

// Common TDeint and TFM version.
//
// Runs AnalyzeOnePixel (neighbour distance 1) over the interior columns
// 1 .. Width - 2 for y = 2, 4, ... while y < Height - 2. Each step advances the
// destination by dst_pitch bytes and the difference buffer by one row.
// tpitch is given in bytes and converted to pixel_t units here.
template<typename pixel_t, int bits_per_pixel>
void AnalyzeDiffMask_Planar(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height)
{
  const int stride = tpitch / sizeof(pixel_t);

  // Five-row window over the difference buffer; `row_cur` is the row under analysis.
  const pixel_t* row_prev = reinterpret_cast<const pixel_t*>(tbuffer8);
  const pixel_t* row_prev2 = row_prev - stride;
  const pixel_t* row_cur = row_prev + stride;
  const pixel_t* row_next = row_prev + stride * 2;
  const pixel_t* row_next2 = row_prev + stride * 3;

  int y = 2;
  while (y < Height - 2) {
    int x = 1;
    while (x < Width - 1) {
      AnalyzeOnePixel<pixel_t, bits_per_pixel, 1>(dstp, row_prev2, row_prev, row_cur, row_next, row_next2, x, y, Width, Height);
      x++;
    }

    // Slide the whole window down by one buffer row.
    row_prev2 += stride;
    row_prev += stride;
    row_cur += stride;
    row_next += stride;
    row_next2 += stride;
    dstp += dst_pitch;
    y += 2;
  }
}
// instantiate
template void AnalyzeDiffMask_Planar<uint8_t,8>(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height);
template void AnalyzeDiffMask_Planar<uint16_t, 10>(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height);
template void AnalyzeDiffMask_Planar<uint16_t, 12>(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height);
template void AnalyzeDiffMask_Planar<uint16_t, 14>(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height);
template void AnalyzeDiffMask_Planar<uint16_t, 16>(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer8, int tpitch, int Width, int Height);

// HBD ready
template<typename pixel_t>
void buildABSDiffMask2_c(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, int bits_per_pixel)
{
  if (width <= 0)
    return;

  constexpr int inc = 1;
  const int Const19 = 19 << (bits_per_pixel - 8);
  const int Const3 = 3 << (bits_per_pixel - 8);
  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; x += inc)
    {
      const int diff = abs(reinterpret_cast<const pixel_t *>(prvp)[x] - reinterpret_cast<const pixel_t*>(nxtp)[x]);
      if (diff > Const19) dstp[x] = 3;
      else if (diff > Const3) dstp[x] = 1;
      else dstp[x] = 0;
    }
    prvp += prv_pitch;
    nxtp += nxt_pitch;
    dstp += dst_pitch;
  }
}


// Unsigned 16-bit "less than or equal" compare.
// Returns 0xFFFF in every lane where x <= y and 0 elsewhere.
static AVS_FORCEINLINE __m128i _MM_CMPLE_EPU16(__m128i x, __m128i y)
{
  // The saturating difference x - y is zero exactly when x does not exceed y.
  const __m128i excess = _mm_subs_epu16(x, y);
  const __m128i zero = _mm_setzero_si128();
  return _mm_cmpeq_epi16(excess, zero);
}

// SSE2 version of buildABSDiffMask2 for 8-bit samples. For every byte the
// result is
//   3 when |prv - nxt| > 19,
//   1 when |prv - nxt| > 3 (but not above 19),
//   0 otherwise.
//
// When width is a multiple of 16 each row is handled in full 16-byte blocks;
// otherwise the last chunk of each row is 8 bytes instead of 16. The 16-byte
// blocks use aligned loads and stores.
void buildABSDiffMask2_uint8_SSE2(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width,
  int height)
{
  const __m128i low_bit = _mm_set1_epi8(0x01); // byte target!
  const __m128i high_bit = _mm_set1_epi8(0x02);
  const __m128i saturated = _mm_set1_epi8(-1);

  // diff > T  <=>  diff - T >= 1  <=>  add_saturate(diff, 255 - T - 1) == 255
  const __m128i bias_above3 = _mm_set1_epi8((char)(255 - 1 - 3));
  const __m128i bias_above19 = _mm_set1_epi8((char)(255 - 1 - 19));

  // Classifies 16 bytes at once (only the low 8 are stored for a half block).
  const auto classify = [&](__m128i prev, __m128i next) -> __m128i {
    // One of the two saturating differences is zero, so OR yields |prev - next|.
    const __m128i diff = _mm_or_si128(_mm_subs_epu8(prev, next), _mm_subs_epu8(next, prev));
    const __m128i over3 = _mm_cmpeq_epi8(_mm_adds_epu8(diff, bias_above3), saturated);
    const __m128i over19 = _mm_cmpeq_epi8(_mm_adds_epu8(diff, bias_above19), saturated);
    // bit 0 for "> 3", bit 1 for "> 19"; both set gives 3
    return _mm_or_si128(_mm_and_si128(over3, low_bit), _mm_and_si128(over19, high_bit));
  };

  const bool half_tail = (width & 15) != 0; // not exact mod16
  const int wide_end = half_tail ? width - 8 : width;

  while (height--) {
    int x = 0; // kept outside the 'for': the tail starts where the loop stops
    for (; x < wide_end; x += 16)
    {
      const __m128i prev_block = _mm_load_si128(reinterpret_cast<const __m128i*>(prvp + x));
      const __m128i next_block = _mm_load_si128(reinterpret_cast<const __m128i*>(nxtp + x));
      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + x), classify(prev_block, next_block));
    }
    if (half_tail)
    {
      // rest 8 bytes
      const __m128i prev_half = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(prvp + x));
      const __m128i next_half = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(nxtp + x));
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp + x), classify(prev_half, next_half));
    }

    prvp += prv_pitch;
    nxtp += nxt_pitch;
    dstp += dst_pitch;
  }
}

// Helper for buildABSDiffMask2_uint16_SSE2: loads eight uint16_t pixels from each
// of prvp/nxtp (aligned loads), forms their unsigned absolute difference and
// compares it against both thresholds. above19/above3 receive the per-word
// results of _MM_CMPLE_EPU16(limit, |prv - nxt|).
static inline void absDiffMask2_u16_compare8(const uint8_t* prvp, const uint8_t* nxtp,
  __m128i limit19plus1, __m128i limit3plus1, __m128i& above19, __m128i& above3)
{
  __m128i prv = _mm_load_si128(reinterpret_cast<const __m128i*>(prvp));
  __m128i nxt = _mm_load_si128(reinterpret_cast<const __m128i*>(nxtp));
  // unsigned |prv - nxt|: one of the two saturating differences is always zero
  __m128i absdiff = _mm_or_si128(_mm_subs_epu16(prv, nxt), _mm_subs_epu16(nxt, prv));
  above19 = _MM_CMPLE_EPU16(limit19plus1, absdiff); // FFFF where 20 <= diff (19 < diff)
  above3 = _MM_CMPLE_EPU16(limit3plus1, absdiff);   // FFFF where 4 <= diff (3 < diff)
}

// Builds a byte mask from the absolute difference of two 9-16 bit planes.
//
// Reference (C) behaviour per pixel, with the 8-bit thresholds 19 and 3 scaled
// by (bits_per_pixel - 8):
//   diff = abs(prv - nxt);  dst = (diff > 19' ? 2 : 0) | (diff > 3' ? 1 : 0)
//
// prvp/nxtp: source rows holding uint16_t pixels, addressed in bytes
// dstp:      destination mask, one byte per pixel
// *_pitch:   row strides in bytes
// width:     row width in pixels; when it is not a multiple of 16 the row is
//            finished with one extra 8-pixel group
// height:    number of rows
//
// Aligned 16-byte loads and stores are used throughout.
//
// Fix: the comparison results are 16-bit all-ones masks (0xFFFF). They used to
// be narrowed with _mm_packus_epi16, which interprets its input as *signed*
// words and saturates -1 (0xFFFF) to 0, so the resulting mask was always zero.
// _mm_packs_epi16 keeps -1 as the byte 0xFF, which is what the following
// AND with 0x01 / 0x02 expects.
void buildABSDiffMask2_uint16_SSE2(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width,
  int height, int bits_per_pixel)
{
  const __m128i onesMask = _mm_set1_epi8(0x01); // byte target!
  const __m128i twosMask = _mm_set1_epi8(0x02);

  // "diff > T" is evaluated as "T + 1 <= diff"
  const int Const19plus1 = (19 << (bits_per_pixel - 8)) + 1;
  const int Const3plus1 = (3 << (bits_per_pixel - 8)) + 1;

  __m128i Compare19plus1 = _mm_set1_epi16(static_cast<short>(Const19plus1));
  __m128i Compare3plus1 = _mm_set1_epi16(static_cast<short>(Const3plus1));

  // When width is not an exact multiple of 16 the last chunk is 8 pixels
  // instead of 16, so the 16-pixel loop must stop 8 pixels earlier.
  const bool hasTail8 = (width & 15) != 0;
  const int width16 = hasTail8 ? width - 8 : width;

  while (height--) {
    int x = 0;
    for (; x < width16; x += 16)
    {
      // 16 byte result needs 32 byte source (16 x uint16_t pixels)
      __m128i above19_lo, above3_lo, above19_hi, above3_hi;
      absDiffMask2_u16_compare8(prvp + x * 2, nxtp + x * 2,
        Compare19plus1, Compare3plus1, above19_lo, above3_lo);
      absDiffMask2_u16_compare8(prvp + x * 2 + 16, nxtp + x * 2 + 16,
        Compare19plus1, Compare3plus1, above19_hi, above3_hi);

      // make bytes from wordBools (signed saturation: 0xFFFF -> 0xFF)
      const __m128i bit0 = _mm_and_si128(_mm_packs_epi16(above3_lo, above3_hi), onesMask);
      const __m128i bit1 = _mm_and_si128(_mm_packs_epi16(above19_lo, above19_hi), twosMask);
      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + x), _mm_or_si128(bit0, bit1));
    }
    if (hasTail8)
    {
      // rest 8 pixels
      __m128i above19, above3;
      absDiffMask2_u16_compare8(prvp + x * 2, nxtp + x * 2,
        Compare19plus1, Compare3plus1, above19, above3);

      // only the lower 8 bytes of the packed result are meaningful
      const __m128i bit0 = _mm_and_si128(_mm_packs_epi16(above3, above3), onesMask);
      const __m128i bit1 = _mm_and_si128(_mm_packs_epi16(above19, above19), twosMask);
      // store 8 bytes
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp + x), _mm_or_si128(bit0, bit1));
    }

    prvp += prv_pitch;
    nxtp += nxt_pitch;
    dstp += dst_pitch;
  }
}

// Scalar combing detector (no luma masking).
// For every pixel of a 'width' x 'height' area starting at srcp, the mask byte
// is set to 0xFF when
//   - the pixel is above both vertical neighbours by more than cthresh, or
//     below both of them by more than cthresh, and
//   - |pp + 4*c + nn - 3*(p + n)| exceeds 6*cthresh
// (p/n = rows -1/+1, pp/nn = rows -2/+2). Mask bytes of pixels that fail the
// test are left untouched. src_pitch is counted in pixel_t elements, cmk_pitch
// in bytes; cthresh is expected to be scaled to the actual bit depth already.
template<typename pixel_t>
void check_combing_c(const pixel_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, int cthresh)
{
  const int filterLimit = cthresh * 6;

  for (int row = 0; row < height; ++row, srcp += src_pitch, cmkp += cmk_pitch)
  {
    const pixel_t* const above2 = srcp - src_pitch * 2;
    const pixel_t* const above1 = srcp - src_pitch;
    const pixel_t* const below1 = srcp + src_pitch;
    const pixel_t* const below2 = srcp + src_pitch * 2;

    for (int col = 0; col < width; ++col)
    {
      const int toAbove = srcp[col] - above1[col];
      const int toBelow = srcp[col] - below1[col];
      const bool isPeak = toAbove > cthresh && toBelow > cthresh;
      const bool isValley = toAbove < -cthresh && toBelow < -cthresh;
      if (!isPeak && !isValley)
        continue;

      // the outer rows are only read once the cheap first test has passed
      const int filtered = above2[col] + (srcp[col] << 2) + below2[col] - (3 * (above1[col] + below1[col]));
      if (abs(filtered) > filterLimit)
        cmkp[col] = 0xFF;
    }
  }
}
// instantiate
template void check_combing_c<uint8_t>(const uint8_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, int cthresh);
template void check_combing_c<uint16_t>(const uint16_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, int cthresh);

// Scalar combing detector, metric 1.
// Sets the mask byte to 0xFF wherever (c - p) * (c - n) > cthreshsq, with p/n
// being the pixels directly above/below c. Other mask bytes are left untouched.
// safeint_t is the integer type the product is evaluated in; src_pitch is
// counted in pixel_t elements, cmk_pitch in bytes.
template<typename pixel_t, typename safeint_t>
void check_combing_c_Metric1(const pixel_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, safeint_t cthreshsq)
{
  for (int row = 0; row < height; ++row, srcp += src_pitch, cmkp += cmk_pitch)
  {
    const pixel_t* const above = srcp - src_pitch;
    const pixel_t* const below = srcp + src_pitch;

    for (int col = 0; col < width; ++col)
    {
      // widen before multiplying so the product cannot overflow
      const safeint_t toAbove = srcp[col] - above[col];
      const safeint_t toBelow = srcp[col] - below[col];
      if (toAbove * toBelow > cthreshsq)
        cmkp[col] = 0xFF;
    }
  }
}
// instantiate
template void check_combing_c_Metric1<uint8_t, int>(const uint8_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, int cthreshsq);
template void check_combing_c_Metric1<uint16_t, int64_t>(const uint16_t* srcp, uint8_t* cmkp, int width, int height, int src_pitch, int cmk_pitch, int64_t cthreshsq);



// SSE2 combing detector for 8-bit data, 16 pixels per step (aligned loads/stores).
//
// Stage 1 (bytes):  pixel is above both vertical neighbours, or below both, by
//                   more than cthresh.
// Stage 2 (words):  |pp + 4*c + nn - 3*(p + n)| > 6*cthresh.
// The destination bytes of a 16-pixel group are written (0xFF where both stages
// pass, 0 elsewhere) only if at least one pixel of the group passed stage 1;
// otherwise the group is left untouched.
//
// "value > threshold" is evaluated by adding (max - threshold - 1) with unsigned
// saturation and checking whether the maximum was reached.
static void check_combing_SSE2_generic(const uint8_t *srcp, uint8_t *dstp, int width,
  int height, int src_pitch, int dst_pitch, int cthresh)
{
  const unsigned int byteBias = std::min(std::max(255 - cthresh - 1, 0), 255);
  const unsigned int wordBias = std::min(std::max(65535 - cthresh * 6 - 1, 0), 65535);
  const __m128i threshb = _mm_set1_epi8(static_cast<char>(byteBias));
  const __m128i thresh6w = _mm_set1_epi16(static_cast<short>(wordBias));

  const __m128i all_ff = _mm_set1_epi8(-1);
  const __m128i zero = _mm_setzero_si128();
  const __m128i three = _mm_set1_epi16(3);

  const auto load16 = [](const uint8_t *p) {
    return _mm_load_si128(reinterpret_cast<const __m128i *>(p));
  };

  // Stage 2 on eight pixels widened to words: returns 0x00FF in every word
  // whose filter response exceeds 6*cthresh, 0 otherwise.
  const auto filterWords = [&](const __m128i &pp, const __m128i &p, const __m128i &c,
    const __m128i &n, const __m128i &nn) {
    const __m128i outer = _mm_mullo_epi16(_mm_adds_epu16(n, p), three);                     // 3*(p+n)
    const __m128i inner = _mm_adds_epu16(_mm_adds_epu16(_mm_slli_epi16(c, 2), pp), nn);     // pp+c*4+nn
    // abs(inner - outer): one of the saturating differences is zero
    const __m128i absdiff = _mm_max_epi16(_mm_subs_epu16(inner, outer), _mm_subs_epu16(outer, inner));
    const __m128i reached = _mm_cmpeq_epi16(_mm_adds_epu16(absdiff, thresh6w), all_ff);
    return _mm_srli_epi16(reached, 8);
  };

  while (height--) {
    for (int x = 0; x < width; x += 16) {
      const __m128i next = load16(srcp + src_pitch + x);
      const __m128i curr = load16(srcp + x);
      const __m128i prev = load16(srcp - src_pitch + x);

      // max(min(p-s,n-s), min(s-n,s-p))
      const __m128i below = _mm_min_epu8(_mm_subs_epu8(prev, curr), _mm_subs_epu8(next, curr));
      const __m128i above = _mm_min_epu8(_mm_subs_epu8(curr, next), _mm_subs_epu8(curr, prev));
      const __m128i stage1 = _mm_cmpeq_epi8(_mm_adds_epu8(_mm_max_epu8(below, above), threshb), all_ff);

      // every byte of stage1 is 0x00 or 0xFF, so the sign-bit mask is zero
      // exactly when no pixel passed
      if (_mm_movemask_epi8(stage1) == 0)
        continue;

      const __m128i prevprev = load16(srcp - src_pitch * 2 + x);
      const __m128i nextnext = load16(srcp + src_pitch * 2 + x);

      const __m128i stage2_lo = filterWords(
        _mm_unpacklo_epi8(prevprev, zero), _mm_unpacklo_epi8(prev, zero), _mm_unpacklo_epi8(curr, zero),
        _mm_unpacklo_epi8(next, zero), _mm_unpacklo_epi8(nextnext, zero));
      const __m128i stage2_hi = filterWords(
        _mm_unpackhi_epi8(prevprev, zero), _mm_unpackhi_epi8(prev, zero), _mm_unpackhi_epi8(curr, zero),
        _mm_unpackhi_epi8(next, zero), _mm_unpackhi_epi8(nextnext, zero));
      const __m128i stage2 = _mm_packus_epi16(stage2_lo, stage2_hi);

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), _mm_and_si128(stage1, stage2));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


// Public entry point of the 8-bit SSE2 combing detector.
void check_combing_SSE2(const uint8_t *srcp, uint8_t *dstp, int width, int height, int src_pitch, int dst_pitch, int cthresh)
{
  check_combing_SSE2_generic(srcp, dstp, width, height, src_pitch, dst_pitch, cthresh);
}


#if defined(GCC) || defined(CLANG)
__attribute__((__target__("sse4.1")))
#endif
void check_combing_uint16_SSE4(const uint16_t* srcp, uint8_t* dstp, int width, int height, int src_pitch, int dst_pitch, int cthresh)
{
  // 16-bit counterpart of the 8-bit SSE2 combing detector; each step turns
  // 8 uint16_t pixels into 8 mask bytes. src_pitch is applied to the uint16_t
  // pointer (i.e. counted in pixels), dst_pitch to the byte mask.
  //
  // Scalar reference:
  //   const int sFirst = srcp[x] - srcpp[x];
  //   const int sSecond = srcp[x] - srcpn[x];
  //   if ((sFirst > cthresh && sSecond > cthresh) || (sFirst < -cthresh && sSecond < -cthresh))
  //   {
  //     if (abs(srcppp[x] + (srcp[x] << 2) + srcpnn[x] - (3 * (srcpp[x] + srcpn[x]))) > cthresh6)
  //       cmkp[x] = 0xFF;
  //   }
  //
  // All intrinsics are kept directly in this function body so that they are
  // compiled under the sse4.1 target attribute above.

  // first test: compare by saturating add and check for the maximum
  const unsigned int satBias = std::min(std::max(65535 - cthresh - 1, 0), 65535);
  const __m128i thresh = _mm_set1_epi16(static_cast<short>(satBias));
  // second test: plain signed 32-bit compare
  const __m128i thresh6 = _mm_set1_epi32(cthresh * 6);

  const __m128i all_ff = _mm_set1_epi8(-1);
  const __m128i zero = _mm_setzero_si128();
  const __m128i three = _mm_set1_epi32(3);
  const int pixelsPerStep = static_cast<int>(16 / sizeof(uint16_t));

  while (height--) {
    for (int x = 0; x < width; x += pixelsPerStep) {
      const __m128i next = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + src_pitch + x));
      const __m128i curr = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + x));
      const __m128i prev = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp - src_pitch + x));

      // max(min(p-s,n-s), min(s-n,s-p)) instead of abs
      const __m128i below = _mm_min_epu16(_mm_subs_epu16(prev, curr), _mm_subs_epu16(next, curr));
      const __m128i above = _mm_min_epu16(_mm_subs_epu16(curr, next), _mm_subs_epu16(curr, prev));
      const __m128i stage1 = _mm_cmpeq_epi16(_mm_adds_epu16(_mm_max_epu16(below, above), thresh), all_ff);

      // every word of stage1 is 0x0000 or 0xFFFF, so the byte sign mask is
      // zero exactly when no pixel passed; such groups are not written at all
      if (_mm_movemask_epi8(stage1) == 0)
        continue;

      const __m128i prevprev = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp - src_pitch * 2 + x));
      const __m128i nextnext = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + src_pitch * 2 + x));

      // lower four pixels, widened to 32 bits
      const __m128i outer_lo = _mm_mullo_epi32(
        _mm_add_epi32(_mm_unpacklo_epi16(next, zero), _mm_unpacklo_epi16(prev, zero)), three);   // 3*(p+n)
      const __m128i inner_lo = _mm_add_epi32(
        _mm_add_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(curr, zero), 2), _mm_unpacklo_epi16(prevprev, zero)),
        _mm_unpacklo_epi16(nextnext, zero));                                                     // pp+c*4+nn
      const __m128i stage2_lo = _mm_cmpgt_epi32(_mm_abs_epi32(_mm_sub_epi32(inner_lo, outer_lo)), thresh6);

      // upper four pixels
      const __m128i outer_hi = _mm_mullo_epi32(
        _mm_add_epi32(_mm_unpackhi_epi16(next, zero), _mm_unpackhi_epi16(prev, zero)), three);
      const __m128i inner_hi = _mm_add_epi32(
        _mm_add_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(curr, zero), 2), _mm_unpackhi_epi16(prevprev, zero)),
        _mm_unpackhi_epi16(nextnext, zero));
      const __m128i stage2_hi = _mm_cmpgt_epi32(_mm_abs_epi32(_mm_sub_epi32(inner_hi, outer_hi)), thresh6);

      // signed packs keep the all-ones masks intact while narrowing 32 -> 16 -> 8 bits
      const __m128i stage2 = _mm_packs_epi32(stage2_lo, stage2_hi);
      const __m128i both = _mm_and_si128(stage1, stage2);
      _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp + x), _mm_packs_epi16(both, both));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


// SSE2 combing detector, metric 1, for 8-bit data (16 pixels per step,
// aligned loads/stores).
// Every destination byte is written: 0xFF where (p - c) * (n - c) > cthreshsq,
// 0 elsewhere (p/n = pixels directly above/below c).
void check_combing_SSE2_Metric1(const uint8_t *srcp, uint8_t *dstp,
  int width, int height, int src_pitch, int dst_pitch, int cthreshsq)
{
  const __m128i thresh = _mm_set1_epi32(cthreshsq);
  const __m128i zero = _mm_setzero_si128();
  const __m128i lowByte = _mm_set1_epi16(0x00FF);

  // For eight pairs of signed 16-bit differences: 0x00FF in every word whose
  // product exceeds cthreshsq. Interleaving with zero places each difference
  // in the low half of a 32-bit lane, so the multiply-add yields the plain
  // signed product per lane.
  const auto productAbove = [&](const __m128i &a, const __m128i &b) {
    const __m128i prod_lo = _mm_madd_epi16(_mm_unpacklo_epi16(a, zero), _mm_unpacklo_epi16(b, zero));
    const __m128i prod_hi = _mm_madd_epi16(_mm_unpackhi_epi16(a, zero), _mm_unpackhi_epi16(b, zero));
    const __m128i words = _mm_packs_epi32(_mm_cmpgt_epi32(prod_lo, thresh), _mm_cmpgt_epi32(prod_hi, thresh));
    return _mm_and_si128(words, lowByte);
  };

  while (height--) {
    for (int x = 0; x < width; x += 16) {
      const __m128i next = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + src_pitch + x));
      const __m128i curr = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));
      const __m128i prev = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp - src_pitch + x));

      // widen to words, lower and upper eight pixels separately
      const __m128i curr_lo = _mm_unpacklo_epi8(curr, zero);
      const __m128i curr_hi = _mm_unpackhi_epi8(curr, zero);

      const __m128i mask_lo = productAbove(
        _mm_subs_epi16(_mm_unpacklo_epi8(prev, zero), curr_lo),
        _mm_subs_epi16(_mm_unpacklo_epi8(next, zero), curr_lo));
      const __m128i mask_hi = productAbove(
        _mm_subs_epi16(_mm_unpackhi_epi8(prev, zero), curr_hi),
        _mm_subs_epi16(_mm_unpackhi_epi8(next, zero), curr_hi));

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), _mm_packus_epi16(mask_lo, mask_hi));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }

}


// SSE2 combing detector, metric 1, evaluating only the even (low) byte of each
// 16-bit pair - the luma samples of packed data such as YUY2.
// Every destination byte is written: an even byte becomes 0xFF where
// (p - c) * (n - c) > cthreshsq and 0 otherwise; odd bytes are always 0.
void check_combing_SSE2_Luma_Metric1(const uint8_t *srcp, uint8_t *dstp,
  int width, int height, int src_pitch, int dst_pitch, int cthreshsq)
{
  const __m128i thresh = _mm_set1_epi32(cthreshsq);
  const __m128i lumaMask = _mm_set1_epi16(0x00FF);
  const __m128i zero = _mm_setzero_si128();

  // loads 16 bytes and keeps the low byte of every word
  const auto loadLuma = [&](const uint8_t *p) {
    return _mm_and_si128(_mm_load_si128(reinterpret_cast<const __m128i *>(p)), lumaMask);
  };

  while (height--) {
    for (int x = 0; x < width; x += 16) {
      const __m128i next = loadLuma(srcp + src_pitch + x);
      const __m128i curr = loadLuma(srcp + x);
      const __m128i prev = loadLuma(srcp - src_pitch + x);

      const __m128i fromPrev = _mm_subs_epi16(prev, curr);
      const __m128i fromNext = _mm_subs_epi16(next, curr);

      // interleaving with zero leaves one difference per 32-bit lane, so the
      // multiply-add produces the plain signed product
      const __m128i prod_lo = _mm_madd_epi16(_mm_unpacklo_epi16(fromPrev, zero), _mm_unpacklo_epi16(fromNext, zero));
      const __m128i prod_hi = _mm_madd_epi16(_mm_unpackhi_epi16(fromPrev, zero), _mm_unpackhi_epi16(fromNext, zero));

      const __m128i words = _mm_packs_epi32(_mm_cmpgt_epi32(prod_lo, thresh), _mm_cmpgt_epi32(prod_hi, thresh));

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), _mm_and_si128(words, lumaMask));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}

// Counts, inside an 8-byte wide column of a mask, the positions where three
// vertically consecutive mask bytes are all 0xFF:
//   if (cmkppT[x + v] == 0xFF && cmkpT[x + v] == 0xFF && cmkpnT[x + v] == 0xFF) sum++;
// srcp points at the "previous" row of the first triple. Rows are consumed in
// pairs, so blockSizeY / 2 iterations evaluate 2 * (blockSizeY / 2) triples and
// read that many rows plus two. The result is stored in 'sum'.
template<int blockSizeY>
void compute_sum_8xN_sse2(const uint8_t *srcp, int pitch, int &sum)
{
  const __m128i one = _mm_set1_epi8(1);
  const __m128i all_ff = _mm_set1_epi8(-1);

  const auto load8 = [](const uint8_t *p) {
    return _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p));
  };
  // 1 in every byte that is exactly 0xFF (this "== 0xFF" check was missing
  // from the original assembler code), 0 elsewhere
  const auto asCount = [&](const __m128i &anded) {
    return _mm_and_si128(_mm_cmpeq_epi8(anded, all_ff), one);
  };

  __m128i rowA = load8(srcp);
  __m128i rowB = load8(srcp + pitch);
  const uint8_t *cursor = srcp + pitch * 2; // row after rowB
  __m128i counts = _mm_setzero_si128();

  // unrolled by two: each pass handles the triples (A,B,C) and (B,C,D)
  //   A  #
  //   B  # #
  //   C  # #
  //   D    #
  for (int pass = 0; pass < blockSizeY / 2; ++pass, cursor += pitch * 2) {
    const __m128i rowC = load8(cursor);
    const __m128i rowD = load8(cursor + pitch);

    const __m128i shared = _mm_and_si128(rowB, rowC);
    counts = _mm_adds_epu8(counts, asCount(_mm_and_si128(rowA, shared)));
    counts = _mm_adds_epu8(counts, asCount(_mm_and_si128(shared, rowD)));

    rowA = rowC;
    rowB = rowD;
  }

  // horizontal sum of the lower 8 bytes via SAD against zero
  // (the upper 8 bytes are always zero and not needed)
  sum = _mm_cvtsi128_si32(_mm_sad_epu8(counts, _mm_setzero_si128()));
}

// instantiate for 8x8
template void compute_sum_8xN_sse2<8>(const uint8_t* srcp, int pitch, int& sum);

// YUY2 luma only case
// Same triple counting as the 8xN variant, but over 16-byte rows (aligned loads)
// and counting only the even byte of every 16-bit pair:
//   if (cmkppT[x + v] == 0xFF && cmkpT[x + v] == 0xFF && cmkpnT[x + v] == 0xFF) sum++;
// srcp points at the "previous" row of the first triple; 8 triples (10 rows)
// are evaluated and the total is stored in 'sum'.
void compute_sum_16x8_sse2_luma(const uint8_t *srcp, int pitch, int &sum)
{
  const __m128i lumaOne = _mm_set1_epi16(0x0001); // ones where luma
  const __m128i all_ff = _mm_set1_epi8(-1);

  const auto load16 = [](const uint8_t *p) {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
  };
  // 1 in every luma byte that is exactly 0xFF (this "== 0xFF" check was missing
  // from the original assembler code), 0 elsewhere
  const auto asCount = [&](const __m128i &anded) {
    return _mm_and_si128(_mm_cmpeq_epi8(anded, all_ff), lumaOne);
  };

  __m128i rowA = load16(srcp);
  __m128i rowB = load16(srcp + pitch);
  const uint8_t *cursor = srcp + pitch * 2; // row after rowB
  __m128i counts = _mm_setzero_si128();

  // 4 passes x 2 triples = 8; each pass handles (A,B,C) and (B,C,D)
  //   A  #
  //   B  # #
  //   C  # #
  //   D    #
  for (int pass = 0; pass < 4; ++pass, cursor += pitch * 2) {
    const __m128i rowC = load16(cursor);
    const __m128i rowD = load16(cursor + pitch);

    const __m128i shared = _mm_and_si128(rowB, rowC);
    counts = _mm_adds_epu8(counts, asCount(_mm_and_si128(rowA, shared)));
    counts = _mm_adds_epu8(counts, asCount(_mm_and_si128(shared, rowD)));

    rowA = rowC;
    rowB = rowD;
  }

  // SAD against zero gives one partial sum per 8-byte half; add the halves
  const __m128i halves = _mm_sad_epu8(counts, _mm_setzero_si128());
  sum = _mm_cvtsi128_si32(_mm_add_epi32(halves, _mm_srli_si128(halves, 8)));
}

// Copies the pixel data of every plane of 'src' into 'dst' with vs_bitblt.
// Bit depth independent: the number of bytes copied per row is the plane width
// multiplied by the format's bytesPerSample. Plane count, widths and heights
// are taken from 'src'.
void copyFrame(VSFrameRef *dst, const VSFrameRef *src, const VSAPI *vsapi)
{
  const VSFormat *format = vsapi->getFrameFormat(src);
  const int planeCount = format->numPlanes;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    const int rowBytes = vsapi->getFrameWidth(src, plane) * format->bytesPerSample;
    const int rows = vsapi->getFrameHeight(src, plane);

    vs_bitblt(vsapi->getWritePtr(dst, plane), vsapi->getStride(dst, plane),
      vsapi->getReadPtr(src, plane), vsapi->getStride(src, plane),
      rowBytes, rows);
  }
}

// fast blend routine for 50:50 case
// Writes the rounded-up average (a + b + 1) >> 1 of two planes, 16 bytes per
// step with aligned loads/stores. 'width' is given in pixels, the pitches in
// bytes; pixel_t only selects between the 8-bit and the 16-bit average.
template<typename pixel_t>
void blend_5050_SSE2(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch)
{
  const int rowBytes = width * (int)sizeof(pixel_t);

  for (; height--; dstp += dst_pitch, srcp1 += src1_pitch, srcp2 += src2_pitch) {
    for (int offset = 0; offset < rowBytes; offset += 16) {
      const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp1 + offset));
      const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp2 + offset));

      __m128i mean;
      if constexpr (sizeof(pixel_t) == 1)
        mean = _mm_avg_epu8(a, b);
      else
        mean = _mm_avg_epu16(a, b);

      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + offset), mean);
    }
  }
}
// instantiate
template void blend_5050_SSE2<uint8_t>(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);
template void blend_5050_SSE2<uint16_t>(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);

// Scalar 50:50 blend: dst = (src1 + src2 + 1) >> 1 for every pixel.
// The buffers are passed as byte pointers and reinterpreted as pixel_t rows;
// 'width' is given in pixels, the pitches in bytes.
template<typename pixel_t>
void blend_5050_c(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch)
{
  for (int row = 0; row < height; ++row, srcp1 += src1_pitch, srcp2 += src2_pitch, dstp += dst_pitch)
  {
    const pixel_t* const first = reinterpret_cast<const pixel_t*>(srcp1);
    const pixel_t* const second = reinterpret_cast<const pixel_t*>(srcp2);
    pixel_t* const out = reinterpret_cast<pixel_t*>(dstp);

    for (int col = 0; col < width; ++col)
      out[col] = (first[col] + second[col] + 1) >> 1;
  }
}

// instantiate
template void blend_5050_c<uint8_t>(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);
template void blend_5050_c<uint16_t>(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);

// like HandleChromaCombing in TDeinterlace
// used by isCombedTIVTC as well
// mask only, no hbd needed
//
// Propagates chroma combing into the luma mask. A chroma mask sample (U or V)
// counts as combed when it is 0xFF and at least one of its eight neighbours in
// the same plane is 0xFF as well. For each such sample the luma mask bytes that
// belong to it are set to 0xFF:
//   444: 1 byte            (row y)
//   422: 2 bytes           (row y)
//   411: 4 bytes           (row y)
//   420: 2 bytes on luma rows 2y and 2y+1, plus row 2y-1 for odd y
//        or row 2y+2 for even y
// Width/Height are the chroma plane dimensions; the outermost chroma rows and
// columns are not evaluated. Pitches are in bytes.
//
// The luma bytes are written one by one. Earlier revisions stored them through
// uint16_t*/uint32_t* casts of the uint8_t mask pointer, which violates the
// strict aliasing rules and assumes suitably aligned rows; the bytes written
// are exactly the same.
template<int planarType>
void do_FillCombedPlanarUpdateCmaskByUV(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV)
{
  // luma rows around the current one; only stepped and written for 4:2:0
  uint8_t* cmkpn = cmkp + cmk_pitch;
  uint8_t* cmkpp = cmkp - cmk_pitch;
  uint8_t* cmkpnn = cmkpn + cmk_pitch;

  uint8_t* cmkppU = cmkpU - cmk_pitchUV;
  uint8_t* cmkpnU = cmkpU + cmk_pitchUV;

  uint8_t* cmkppV = cmkpV - cmk_pitchUV;
  uint8_t* cmkpnV = cmkpV + cmk_pitchUV;

  // true when row[x] is set and any of its eight neighbours is set too
  const auto hasSetNeighbour = [](const uint8_t* above, const uint8_t* row, const uint8_t* below, int x) {
    return row[x] == 0xFF &&
      (row[x - 1] == 0xFF || row[x + 1] == 0xFF ||
        above[x - 1] == 0xFF || above[x] == 0xFF || above[x + 1] == 0xFF ||
        below[x - 1] == 0xFF || below[x] == 0xFF || below[x + 1] == 0xFF);
  };

  // sets the 'span' luma mask bytes covered by chroma column x
  const auto markLuma = [](uint8_t* row, int x, int span) {
    uint8_t* first = row + static_cast<ptrdiff_t>(x) * span;
    for (int i = 0; i < span; ++i)
      first[i] = 0xFF;
  };

  for (int y = 1; y < Height - 1; ++y)
  {
    if (planarType == 420) {
      // one chroma row covers two luma rows
      cmkp += cmk_pitch * 2;
      cmkpn += cmk_pitch * 2;
      cmkpp += cmk_pitch * 2;
      cmkpnn += cmk_pitch * 2;
    }
    else {
      cmkp += cmk_pitch;
    }
    cmkppV += cmk_pitchUV;
    cmkpV += cmk_pitchUV;
    cmkpnV += cmk_pitchUV;
    cmkppU += cmk_pitchUV;
    cmkpU += cmk_pitchUV;
    cmkpnU += cmk_pitchUV;
    for (int x = 1; x < Width - 1; ++x)
    {
      if (!hasSetNeighbour(cmkppV, cmkpV, cmkpnV, x) && !hasSetNeighbour(cmkppU, cmkpU, cmkpnU, x))
        continue;

      if (planarType == 420) {
        markLuma(cmkp, x, 2);
        markLuma(cmkpn, x, 2);
        markLuma((y & 1) ? cmkpp : cmkpnn, x, 2);
      }
      else if (planarType == 422) {
        markLuma(cmkp, x, 2);
      }
      else if (planarType == 444) {
        markLuma(cmkp, x, 1);
      }
      else if (planarType == 411) {
        markLuma(cmkp, x, 4);
      }
    }
  }
}

template void do_FillCombedPlanarUpdateCmaskByUV<411>(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV);
template void do_FillCombedPlanarUpdateCmaskByUV<420>(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV);
template void do_FillCombedPlanarUpdateCmaskByUV<422>(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV);
template void do_FillCombedPlanarUpdateCmaskByUV<444>(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV);
07070100000006000081A4000000000000000000000001671240C9000014ED000000000000000000000000000000000000003000000000vapoursynth-tivtc-2+2.g7abd4a3/src/TCommonASM.h/*
**   Helper methods for TIVTC and TDeint
**
**
**   Copyright (C) 2004-2007 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#ifndef __TCOMMONASM_H__
#define __TCOMMONASM_H__

#include <stdint.h>
#include "internal.h"
#include <xmmintrin.h>
#include <emmintrin.h>
#include <algorithm>
#include <VSHelper.h>
#include "cpufeatures.h"

// Four-tap integer interpolation with weights (-3, 19, 19, -3) / 32, rounded
// by adding 16 before the shift and clamped to the valid pixel range
// [0, 2^bits_per_pixel - 1].
template<int bits_per_pixel>
AVS_FORCEINLINE int cubicInt(int p1, int p2, int p3, int p4)
{
  constexpr int ceiling = (1 << bits_per_pixel) - 1;
  const int rounded = (19 * (p2 + p3) - 3 * (p1 + p4) + 16) >> 5;
  const int nonNegative = rounded < 0 ? 0 : rounded;
  return nonNegative > ceiling ? ceiling : nonNegative;
}

// Absolute-difference helpers. Only the declarations live in this header.
// All three take two source planes and one destination plane, each with its own pitch,
// plus the width and height of the area to process.
// NOTE(review): judging by the suffixes, _SSE2 is the SIMD path and _c the plain C++ path;
// the uint16 variant takes a single mthresh instead of mthresh1/mthresh2 — confirm the
// exact threshold semantics against the definitions.
void absDiff_SSE2(const uint8_t* srcp1, const uint8_t* srcp2,
  uint8_t* dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh1, int mthresh2);

void absDiff_c(const uint8_t* srcp1, const uint8_t* srcp2,
  uint8_t* dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh1, int mthresh2);

void absDiff_uint16_c(const uint8_t* srcp1, const uint8_t* srcp2,
  uint8_t* dstp, int src1_pitch, int src2_pitch, int dst_pitch, int width,
  int height, int mthresh);

// Combing-check helpers. Each reads a source plane (srcp, src_pitch) and writes an
// 8-bit destination plane (dstp, dst_pitch) of the given width and height.
// The plain variants take a threshold cthresh; the *_Metric1 variants take cthreshsq
// (presumably the squared threshold — confirm against the definitions).
template<typename pixel_t>
void check_combing_c(const pixel_t* srcp, uint8_t* dstp, int width, int height, int src_pitch, int dst_pitch, int cthresh);


// safeint_t is the type of the cthreshsq argument and is chosen by the caller.
template<typename pixel_t, typename safeint_t>
void check_combing_c_Metric1(const pixel_t* srcp, uint8_t* dstp, int width, int height, int src_pitch, int dst_pitch, safeint_t cthreshsq);

void check_combing_SSE2(const uint8_t *srcp, uint8_t *dstp,
  int width, int height, int src_pitch, int dst_pitch, int cthresh);

// With GCC or Clang (the build passes -DGCC=1 / -DCLANG=1) this function is compiled
// with the sse4.1 target enabled via a function attribute; the global compiler flags
// only enable SSE2.
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("sse4.1")))
#endif 
void check_combing_uint16_SSE4(const uint16_t* srcp, uint8_t* dstp, int width, int height, int src_pitch, int dst_pitch, int cthresh);

void check_combing_SSE2_Metric1(const uint8_t *srcp, uint8_t *dstp,
  int width, int height, int src_pitch, int dst_pitch, int cthreshsq);
  
void check_combing_SSE2_Luma_Metric1(const uint8_t *srcp, uint8_t *dstp,
  int width, int height, int src_pitch, int dst_pitch, int cthreshsq);

// Difference-mask builders. They take a "previous" plane (prvp) and a "next" plane (nxtp)
// as byte pointers with their pitches and write into dstp / tbuffer.
// The do_* entry points additionally receive the CPU feature flags.
// NOTE(review): presumably do_* uses cpuFlags to pick between the _SSE2 and _c variants,
// and pixel_t is the sample type behind the byte pointers — confirm in the definitions.
template<typename pixel_t>
void buildABSDiffMask_SSE2(const uint8_t *prvp, const uint8_t *nxtp,
  uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height);

template<typename pixel_t>
void buildABSDiffMask_c(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height);

template<typename pixel_t>
void do_buildABSDiffMask(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* tbuffer,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height, const CPUFeatures *cpuFlags);

// Analysis of a previously built mask buffer (tbuffer, tpitch) into dstp.
// One templated version for planar data and one for YUY2; the YUY2 version has an extra
// mChroma switch.
template<typename pixel_t, int bits_per_pixel>
void AnalyzeDiffMask_Planar(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer, int tpitch, int Width, int Height);
void AnalyzeDiffMask_YUY2(uint8_t* dstp, int dst_pitch, uint8_t* tbuffer, int tpitch, int Width, int Height, bool mChroma);


// Second family of difference-mask builders ("Mask2"). The uint16 and generic variants
// take the bit depth as a run-time argument.
void buildABSDiffMask2_uint8_SSE2(const uint8_t *prvp, const uint8_t *nxtp,
  uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height);

void buildABSDiffMask2_uint16_SSE2(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, int bits_per_pixel);

template<typename pixel_t>
void buildABSDiffMask2_c(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, int bits_per_pixel);

template<typename pixel_t>
void do_buildABSDiffMask2(const uint8_t* prvp, const uint8_t* nxtp, uint8_t* dstp,
  int prv_pitch, int nxt_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpuFlags, int bits_per_pixel);


// Block-sum helpers: the result is returned through the `sum` reference.
// NOTE(review): the names suggest 8-pixel-wide blocks of blockSizeY rows and a 16x8
// luma-only block — confirm against the definitions.
template<int blockSizeY>
void compute_sum_8xN_sse2(const uint8_t *srcp, int pitch, int &sum);

void compute_sum_16x8_sse2_luma(const uint8_t *srcp, int pitch, int &sum);

// fixme: put non-asm utility functions into different file
// Copies the frame `src` into the already allocated frame `dst` through the VapourSynth API.
void copyFrame(VSFrameRef *dst, const VSFrameRef *src, const VSAPI *vsapi);

// Blend helpers taking two source planes and one destination plane.
// NOTE(review): "5050" presumably means an equal-weight average of srcp1 and srcp2 — confirm.
template<typename pixel_t>
void blend_5050_SSE2(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);
template<typename pixel_t>
void blend_5050_c(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height, int dst_pitch, int src1_pitch, int src2_pitch);

// planarType is instantiated with 420, 422 and 444 in the implementation file.
// Takes the luma mask (cmkp) and both chroma masks (cmkpU, cmkpV) with their pitches.
template<int planarType>
void do_FillCombedPlanarUpdateCmaskByUV(uint8_t* cmkp, uint8_t* cmkpU, uint8_t* cmkpV, int Width, int Height, ptrdiff_t cmk_pitch, ptrdiff_t cmk_pitchUV);

#endif // __TCOMMONASM_H__
07070100000007000081A4000000000000000000000001671240C90001EF39000000000000000000000000000000000000003100000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimate.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2017-2018 pinterf
**   orgOut addition: (C)2018 8day
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include "TDecimateASM.h"
#include "TCommonASM.h"
#include <inttypes.h>
#include <algorithm>

// Entry point of the frame callback: limits n to [0, nfrmsN] and forwards the request to
// the handler that belongs to the configured mode.
// A TIVTCError thrown by a handler is reported through setFilterError and turned into a
// nullptr return. A mode without a handler also yields nullptr.
const VSFrameRef *TDecimate::GetFrame(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core)
{
  if (n < 0)
    n = 0;
  else if (n > nfrmsN)
    n = nfrmsN;

  try {
    // Modes 0 and 1 (and anything below): most similar / longest string.
    if (mode < 2)
      return GetFrameMode01(n, activationReason, frameData, frameCtx, core);

    switch (mode) {
    case 2: // arbitrary framerate
      return GetFrameMode2(n, activationReason, frameData, frameCtx, core);
    case 3: // single pass mkv-vfr
      return GetFrameMode3(n, activationReason, frameData, frameCtx, core);
    case 4: // metrics output
      return GetFrameMode4(n, activationReason, frameCtx, core);
    case 5: // second pass of two pass hybrid
    case 6: // second pass for 120fps to vfr (shares the mode 5 handler)
      return GetFrameMode56(n, activationReason, frameCtx, core);
    case 7: // arbitrary framerate v2
      return GetFrameMode7(n, activationReason, frameData, frameCtx, core);
    default:
      return nullptr;
    }
  } catch (const TIVTCError &e) {
    vsapi->setFilterError(e.what(), frameCtx);
    return nullptr;
  }
}


// Used by modes 0, 1, and 3: how an output frame is assembled.
enum OutputType {
    SingleFrame = 0,      // one source frame is copied
    TwoFramesBlended = 1, // two source frames are blended
};

// Decision record for one output frame: which frame(s) to fetch and how to combine them.
// The frame handlers allocate it with `new`, park it in *frameData between activations
// and delete it once the output frame has been produced.
//
// Every scalar member has a default value. The handlers do not always fill in all fields
// (e.g. f2 is left alone for some SingleFrame results) but the display code still reads
// them, so without defaults an indeterminate value would be read.
struct OutputInfo {
    OutputType type = SingleFrame;
    int f1 = 0;      // first (or only) frame number to fetch
    int f2 = 0;      // second frame number, requested only for TwoFramesBlended
    double a1 = 0.0; // blend weight passed to blendFrames for a blended result
    double a2 = 0.0; // second weight, only shown by the display code
    std::vector<uint64_t> metrics;

    // For display only:
    int requested_frame_number = 0; // requested from TDecimate
    int chosen_frame_number = 0;    // chosen from child/clip2 to return
    bool film = false;

    // Fills in every field except `metrics` in one call.
    void set(OutputType _type, int _f1, int _f2, double _a1, double _a2, int _requested, int _chosen, bool _film) {
        type = _type;
        f1 = _f1;
        f2 = _f2;
        a1 = _a1;
        a2 = _a2;
        requested_frame_number = _requested;
        chosen_frame_number = _chosen;
        film = _film;
    }

    // Requests f1 from `clip`, plus f2 when two frames are going to be blended.
    void requestFrames(VSNodeRef *clip, VSFrameContext *frameCtx, const VSAPI *vsapi) {
        vsapi->requestFrameFilter(f1, clip, frameCtx);

        if (type == TwoFramesBlended)
            vsapi->requestFrameFilter(f2, clip, frameCtx);
    }
};
////////////////////


// PF 180131 uses usehints! but no problem, its runtime
// Frame handler for modes 0 and 1 (GetFrame dispatches here when mode < 2).
//
// The handler is entered several times for the same output frame n:
//   1. arInitial: requests the child frames around the cycle that contains n.
//   2. arAllFramesReady with *frameData == nullptr: updates the prev/curr/next/nbuf cycle
//      state, decides which clip2 frame(s) form the output, stores that decision in a
//      heap-allocated OutputInfo (*frameData) and requests those frames from clip2.
//   3. arAllFramesReady with *frameData != nullptr: builds the output frame (copy or
//      blend), sets the frame properties, frees the OutputInfo and returns the frame.
// Any other activation reason (e.g. arError) frees a pending OutputInfo so that it does
// not leak, and returns nullptr.
//
// Returns the finished frame in step 3 and nullptr in every other case. On the internal
// error path a filter error is set before returning nullptr.
const VSFrameRef * TDecimate::GetFrameMode01(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core)
{
    if (activationReason != arInitial && activationReason != arAllFramesReady) {
        // No further call will consume the pending decision, so release it here.
        delete (OutputInfo *)*frameData;
        *frameData = nullptr;
        return nullptr;
    }

  // First input frame of the cycle that output frame n belongs to.
  int EvalGroup;
  if (hybrid != 3) EvalGroup = ((int)(n / (cycle - cycleR))) * cycle;
  else EvalGroup = ((int)(n / cycle)) * cycle;

  bool first_frame_in_cycle = hybrid != 3 ? n % (cycle - cycleR) == 0
                                          : n % cycle == 0;

  if (activationReason == arInitial) {
      // Request the surrounding child frames, clamped to the valid frame range.
      for (int i = EvalGroup - cycle - 1; i < EvalGroup + (cycle * 3); i++)
          vsapi->requestFrameFilter(std::max(0, std::min(i, vi_child->numFrames - 1)), child, frameCtx);

      return nullptr;
  } else if (activationReason == arAllFramesReady && *frameData != nullptr) {
      // Step 3: the clip2 frames requested through the OutputInfo are available.
      const OutputInfo *o = (const OutputInfo *)*frameData;

      VSFrameRef *dst = nullptr;
      const VSFrameRef *frame1 = vsapi->getFrameFilter(o->f1, clip2, frameCtx);

      if (o->type == SingleFrame) {
          dst = vsapi->copyFrame(frame1, core);
      } else if (o->type == TwoFramesBlended) {
          const VSFrameRef *frame2 = vsapi->getFrameFilter(o->f2, clip2, frameCtx);
          dst = vsapi->newVideoFrame(vi_clip2->format, vi_clip2->width, vi_clip2->height, frame1, core);
          blendFrames(frame1, frame2, dst, o->a1);
          vsapi->freeFrame(frame2);
      }
      vsapi->freeFrame(frame1);

      if (display)
          displayOutput(dst, o->requested_frame_number, o->chosen_frame_number, o->film, o->a1, o->a2, o->f1, o->f2);

      VSMap *props = vsapi->getFramePropsRW(dst);

      // The cycle-level properties are attached only to the first output frame of a cycle.
      if (first_frame_in_cycle) {
        vsapi->propSetInt(props, PROP_TDecimateCycleStart, EvalGroup, paReplace);
        vsapi->propSetIntArray(props, PROP_TDecimateCycleMaxBlockDiff, (const int64_t *)o->metrics.data(), cycle);
      }
      vsapi->propSetInt(props, PROP_TDecimateOriginalFrame, o->f1, paReplace);

      vsapi->propSetInt(props, PROP_DurationNum, vi.fpsDen, paReplace);
      vsapi->propSetInt(props, PROP_DurationDen, vi.fpsNum, paReplace);

      delete o;
      *frameData = nullptr;

      return dst;
  }

  // Step 2 starts here: the child frames are ready and no decision has been made yet.

  // rerunFromStart is only executed if all the metrics are already available from the "input" file (fullInfo is true)
  // thus it never requests any frames, it always does calculations from the stored metrics
  if (n != lastn + 1 && EvalGroup >= cycle && fullInfo && (EvalGroup != curr.frame ||
    EvalGroup - cycle != prev.frame || EvalGroup + cycle != next.frame))
    rerunFromStart(EvalGroup, frameCtx, core);

  lastn = n;
//  if (ecf) child->SetCacheHints(EvalGroup, -20);
  if (curr.frame != EvalGroup)
  {
    // Shift the cycle window (prev <- curr <- next <- nbuf) and recompute whatever does
    // not already describe the expected cycle.
    prev = curr;
    if (prev.frame != EvalGroup - cycle)
    {
      prev.setFrame(EvalGroup - cycle);
      getOvrCycle(prev, false);
      calcMetricCycle(prev, true, true, core, frameCtx);
      if (hybrid > 0)
      {
        checkVideoMatches(prev, prev);
        checkVideoMetrics(prev, vidThresh);
      }
      if (output.size()) addMetricCycle(prev);
    }
    curr = next;
    if (curr.frame != EvalGroup)
    {
      curr.setFrame(EvalGroup);
      getOvrCycle(curr, false);
      calcMetricCycle(curr, true, true, core, frameCtx);
      if (hybrid > 0)
      {
        checkVideoMatches(prev, curr);
        checkVideoMetrics(curr, vidThresh);
      }
      if (output.size()) addMetricCycle(curr);
    }
    next = nbuf;
    if (next.frame != EvalGroup + cycle)
      next.setFrame(EvalGroup + cycle);
    getOvrCycle(next, false);
    calcMetricCycle(next, true, true, core, frameCtx);
    if (hybrid > 0)
    {
      checkVideoMatches(curr, next);
      checkVideoMetrics(next, vidThresh);
    }
    if (output.size()) addMetricCycle(next);
    nbuf.setFrame(EvalGroup + cycle * 2);
    getOvrCycle(nbuf, false);
    if (hybrid > 0 && curr.type > 1)
    {
      // Hybrid handling: decide from the types of the neighbouring cycles whether the
      // current cycle is treated as video.
      int scenetest = curr.sceneDetect(prev, next, sceneThreshU);
      bool isVid = ((curr.type == 2 || curr.type == 4) && !curr.isfilmd2v && // matches
        (prev.type == 5 || (prev.type == 2 && (vidDetect == 0 || vidDetect == 2)) || prev.type == 4 ||
          next.type == 5 || (next.type == 2 && (vidDetect == 0 || vidDetect == 2)) || next.type == 4 ||
          conCycle == 1 || scenetest != -20));
      bool isVid2 = ((curr.type == 3 || curr.type == 4) && !curr.isfilmd2v && // metrics
        (prev.type == 5 || (prev.type == 3 && (vidDetect == 1 || vidDetect == 2)) || prev.type == 4 ||
          next.type == 5 || (next.type == 3 && (vidDetect == 1 || vidDetect == 2)) || next.type == 4 ||
          conCycle == 1 || scenetest != -20));
      if (curr.type == 5 || (vidDetect == 0 && isVid) || (vidDetect == 1 && isVid2) ||
        (vidDetect == 2 && (isVid2 || isVid)) || (vidDetect == 3 && (isVid2 && isVid)))
      {
        int temp = curr.sceneDetect(prev, next, sceneThreshU);
        if (temp != -20 && hybrid != 3)
        {
          // A scene change was found inside the cycle: decimate exactly that frame.
          for (int p = curr.cycleS; p < curr.cycleE; ++p) curr.decimate[p] = curr.decimate2[p] = 0;
          curr.decimate[temp] = curr.decimate2[temp] = 1;
          curr.blend = 2;
          curr.decSet = true;
        }
        else curr.blend = 1;
      }
      else { goto novidjump; }
    }
    else
    {
    novidjump:
      // Not video: make the regular decimation decision for the current cycle.
      if (mode == 0)
      {
        mostSimilarDecDecision(prev, curr, next);
      }
      else
      {
        prev.setDups(dupThresh);
        curr.setDups(dupThresh);
        next.setDups(dupThresh);
        findDupStrings(prev, curr, next);
      }
      if (curr.blend == 3)
      {
        int tscene = curr.sceneDetect(prev, next, sceneThreshU);
        if (tscene != -20 && curr.decimate[tscene] == 1 && hybrid != 3)
        {
          curr.decimate[tscene] = curr.decimate2[tscene] = 0;
          curr.blend = 0;
        }
      }
      if (curr.blend != 3) curr.blend = 0;
    }
//    if (debug) debugOutput1(n, curr.blend == 1 ? false : true, curr.blend);
  }
  // Compute one still-missing metric of the look-ahead cycle per call.
  for (int j = nbuf.cycleS; j < nbuf.cycleE; ++j)
  {
    if (nbuf.diffMetricsU[j] == UINT64_MAX || nbuf.diffMetricsUF[j] == UINT64_MAX ||
      nbuf.match[j] == -20)
    {
      calcMetricPreBuf(next.frameEO - 1 + j, next.frameEO + j, j, vi_child, true, true, frameCtx, core);
      break;
    }
  }
  
  // From here on the decision is stored in *o; step 3 consumes and frees it.
  OutputInfo *o = new OutputInfo;
  *frameData = (void *)o;

  if (first_frame_in_cycle) {
      o->metrics.assign(curr.diffMetricsU, curr.diffMetricsU + cycle);
//      o->metrics.resize(cycle);
//          memcpy(o->metrics.data(), curr.diffMetricsU, cycle * sizeof(*o->metrics.data()));
  }

  if (curr.blend == 3)  // 2 dups detected
  {
    if (hybrid == 3)  // blend up-convert (hybrid=3 leaves video untouched)
    {
      bool tsc = false;
      int tscene = curr.sceneDetect(prev, next, sceneThreshU);
      if (tscene == -20)
      {
        tscene = next.sceneDetect(sceneThreshU);
        if (tscene == 0 && next.diffMetricsUF[next.cycleS] > sceneThreshU &&
          curr.sceneDetect(sceneThreshU) == -20)
        {
          tscene = curr.length;
          tsc = true;
        }
        else tscene = -20;
      }
      else if (tscene == 0 && curr.diffMetricsUF[curr.cycleS] > sceneThreshU) tsc = true;
      double a1, a2; // a2 = 1.0 - a1
      int f1, f2;
      calcBlendRatios2(a1, a2, f1, f2, n, prev, curr, next, 2);

      o->type = SingleFrame;

      if (a1 >= 1.0)
      {
        // #1 is 100%
        o->f1 = f1;
      }
      else if (a2 >= 1.0)
      {
        // #2 is 100%
        o->f1 = f2;
      }
      else if (tscene >= 0 &&
        ((!tsc && (f1 == curr.frame + tscene || f2 == curr.frame + tscene + 1)) ||
          (tsc && (f1 == curr.frame + tscene - 1 || f2 == curr.frame + tscene))))
      {
        // The blend would straddle a scene change: return a single frame instead.
        if (!tsc)
        {
          f1 = curr.frame + tscene;
          f2 = curr.frame + tscene + 1;
        }
        else
        {
          f1 = curr.frame + tscene - 1;
          f2 = curr.frame + tscene;
        }
        a1 = 1.0; // make #1 as 100%
        a2 = 0.0;

        o->f1 = f1;
      }
      else
      {
          o->type = TwoFramesBlended;
          o->f1 = f1;
          o->f2 = f2;
      }
//      if (debug) debugOutput2(n, 0, true, f1, f2, a1, a2);

      o->requested_frame_number = n;
      o->chosen_frame_number = 0;
      o->film = true;
      o->a1 = a1;
      o->a2 = a2;
      o->requestFrames(clip2, frameCtx, vsapi);
      return nullptr;
    }
    // drop one dup and replace the other with a blend of its neighbors
    // (if noblend=false)... or if one is next to a scenechange then just
    // leave it (will be much less noticeable than a blend).
    int ret = n % (cycle - cycleR), y, f1 = 0, f2 = 0, jk;
    double a1 = 0.0, a2 = 0.0;
    int tscene = curr.sceneDetect(prev, next, sceneThreshU);
    if (tscene != -20)
    {
      for (jk = -1, y = curr.cycleS; y < curr.cycleE; ++y)
      {
        if (curr.decimate[y] == 0) ++jk;
        if (y == tscene && jk < ret) ++jk;
        if (ret == jk)
        {
          ret = y;
          break;
        }
      }
    }
    else
    {
      // Locate the two decimated positions; d1 ends up as the one with the smaller metric.
      int d1 = -20, d2 = -20;
      for (y = curr.cycleS; y < curr.cycleE; ++y)
      {
        if (curr.decimate[y] == 1 && d1 == -20) d1 = y;
        else if (curr.decimate[y] == 1 && d2 == -20) { d2 = y; break; }
      }
      if (curr.diffMetricsU[d1] > curr.diffMetricsU[d2]) d1 = d2;
      for (jk = 0, y = curr.cycleS; y < curr.cycleE; ++y)
      {
        if (ret == jk && y != d1)
        {
          if (curr.decimate[y] == 1)
          {
            f1 = curr.frameSO + y - 1;
            f2 = curr.frameSO + y + 1;
            a1 = a2 = 0.5; // 50-50%
          }
          else ret = y;
          break;
        }
        if (y != d1) ++jk;
      }
    }

    // f1 is still 0 unless the loop above selected a blend of two neighbours.
    if (f1 != 0)
    {
//      if (debug) debugOutput2(n, 0, true, f1, f2, a1, a2);
      o->set(TwoFramesBlended, f1, f2, a1, a2, n, 0, true);
    } else {
//    if (debug) debugOutput2(n, curr.frame + ret, true, f1, f2, a1, a2);
      o->set(SingleFrame, curr.frame + ret, -69, a1, a2, n, curr.frame + ret, true);
    }

    o->requestFrames(clip2, frameCtx, vsapi);
    return nullptr;
    // end of curr_blend == 3
  }
  else if (curr.blend != 1)  // normal film (1 dup)
  {
    if (hybrid == 3)  // blend up-convert (hybrid=3 leaves video untouched)
    {
      bool tsc = false;
      int tscene = curr.sceneDetect(prev, next, sceneThreshU);
      if (tscene == -20)
      {
        tscene = next.sceneDetect(sceneThreshU);
        if (tscene == 0 && next.diffMetricsUF[next.cycleS] > sceneThreshU &&
          curr.sceneDetect(sceneThreshU) == -20)
        {
          tscene = curr.length;
          tsc = true;
        }
        else tscene = -20;
      }
      else if (tscene == 0 && curr.diffMetricsUF[curr.cycleS] > sceneThreshU) tsc = true;

      double a1, a2;
      int f1, f2;
      calcBlendRatios2(a1, a2, f1, f2, n, prev, curr, next, 1);

      o->type = SingleFrame;

      if (a1 >= 1.0)
      {
        o->f1 = f1;
      }
      else if (a2 >= 1.0)
      {
        o->f1 = f2;
      }
      else if (tscene >= 0 &&
        ((!tsc && (f1 == curr.frame + tscene || f2 == curr.frame + tscene + 1)) ||
        (tsc && (f1 == curr.frame + tscene - 1 || f2 == curr.frame + tscene))))
      {
        if (!tsc)
        {
          f1 = curr.frame + tscene;
          f2 = curr.frame + tscene + 1;
        }
        else
        {
          f1 = curr.frame + tscene - 1;
          f2 = curr.frame + tscene;
        }
        a1 = 1.0; // make #1 as 100%
        a2 = 0.0;

        o->f1 = f1;
      }
      else
      {
          o->type = TwoFramesBlended;
          o->f1 = f1;
          o->f2 = f2;
      }
//      if (debug) debugOutput2(n, 0, true, f1, f2, a1, a2);

      o->requested_frame_number = n;
      o->chosen_frame_number = 0;
      o->film = true;
      o->a1 = a1;
      o->a2 = a2;
      o->requestFrames(clip2, frameCtx, vsapi);
      return nullptr;
    }
    // normal drop operation
    int ret = curr.getNonDec(n % (cycle - cycleR));
    if (ret == -1)
    {
      curr.debugOutput();
      curr.debugMetrics(curr.length);
      // Nothing will consume the decision record after an error, so free it here.
      delete o;
      *frameData = nullptr;
      vsapi->setFilterError("TDecimate:  major internal error. Couldn't figure out which frame to return. Please report this ASAP!", frameCtx);
      return nullptr;
    }
//    if (debug) debugOutput2(n, curr.frame + ret, curr.blend == 2 ? false : true, 0, 0, 0.0, 0.0);

    o->set(SingleFrame, curr.frame + ret, -69, 0.0, 0.0, n, curr.frame + ret, curr.blend != 2);
    o->requestFrames(clip2, frameCtx, vsapi);
    return nullptr;
  }
  else  // video (no dups)
  {
    if (hybrid == 3) // return source frame (hybrid=3 leaves video untouched)
    {
//      if (debug) debugOutput2(n, n, false, 0, 0, 0.0, 0.0);

        // So.... did it not drop any frames up to this one? That's the only way output frame n corresponds to input frame n.
      o->set(SingleFrame, n, -69, 0.0, 0.0, n, n, false);
      o->requestFrames(clip2, frameCtx, vsapi);
      return nullptr;
    }
    // blend down-convert (hybrid=1 leaves film untouched)

    double a1, a2;
    int f1, f2;
    calcBlendRatios(a1, a2, f1, f2, n, curr.frame, curr.cycleE - curr.cycleS);

    o->type = SingleFrame;

    if (a1 >= 1.0)
    {
      o->f1 = f1;
    }
    else if (a2 >= 1.0)
    {
      o->f1 = f2;
    }
    else
    {
        o->type = TwoFramesBlended;
        o->f1 = f1;
        o->f2 = f2;
    }

//    if (debug) debugOutput2(n, 0, false, f1, f2, a1, a2);

    o->requested_frame_number = n;
    o->chosen_frame_number = 0;
    o->film = false;
    o->a1 = a1;
    o->a2 = a2;
    o->requestFrames(clip2, frameCtx, vsapi);
    return nullptr;
  }
}

// Overwrites every plane of `dst`: plane 0 is zero-filled, every further plane is filled
// with the mid value 128 scaled to the sample bit depth (128 << (bits - 8)).
// The whole stride * height area of each plane is written, padding included.
void setBlack(VSFrameRef *dst, const VSAPI *vsapi)
{
  const VSFormat *fmt = vsapi->getFrameFormat(dst);
  const int planeCount = fmt->numPlanes;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    uint8_t *data = vsapi->getWritePtr(dst, plane);
    const int stride = vsapi->getStride(dst, plane);
    const size_t rows = vsapi->getFrameHeight(dst, plane);

    if (plane == 0)
    {
      // luma
      memset(data, 0, stride * rows);
      continue;
    }

    // chroma
    const int bits = fmt->bitsPerSample;
    if (bits == 8)
    {
      memset(data, 128, stride * rows);
    }
    else
    {
      uint16_t *samples = (uint16_t *)data;
      std::fill_n(samples, stride * rows / sizeof(uint16_t), 128 << (bits - 8));
    }
  }
}

// Frame handler for mode 3 (single pass mkv-vfr). Frames must be requested in strictly
// linear order; otherwise a filter error is raised.
//
// The handler is entered several times for the same output frame n:
//   1. arInitial: requests the child frames from lastCycle - 1 up to lastCycle + 4 * cycle.
//   2. arAllFramesReady with *frameData == nullptr: at the start of a new group it advances
//      the prev/curr/next/nbuf cycle state, classifies the current cycle as video or film,
//      writes the timecode lines to mkvOutF and sets retFrames. It then stores the chosen
//      clip2 frame in a heap-allocated OutputInfo (*frameData) and requests it.
//   3. arAllFramesReady with *frameData != nullptr: copies/blends the output frame, sets
//      the duration properties, frees the OutputInfo and returns the frame.
// Any other activation reason (e.g. arError) frees a pending OutputInfo so that it does
// not leak, and returns nullptr.
//
// The statistics (vidC, filmC, ...) and the running timestamp are function-local statics:
// they are shared by all calls and reset when n == 0.
const VSFrameRef * TDecimate::GetFrameMode3(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core)
{
  static int vidC = 0;
  static int filmC = 0;
  static int longestT = 0;
  static int longestV = 0;
  static int countVT = 0;
  static double timestamp = 0.0;

  if (activationReason != arInitial && activationReason != arAllFramesReady) {
      // No further call will consume the pending decision, so release it here.
      delete (OutputInfo *)*frameData;
      *frameData = nullptr;
      return nullptr;
  }

  if (activationReason == arInitial) {
      // Request the child frames for the upcoming cycles, clamped to the valid range.
      for (int i = lastCycle - 1; i < lastCycle + (cycle * 4); i++)
          vsapi->requestFrameFilter(std::max(0, std::min(i, vi_child->numFrames - 1)), child, frameCtx);

      return nullptr;
  } else if (activationReason == arAllFramesReady && *frameData != nullptr) {
      // Step 3: the clip2 frame(s) requested through the OutputInfo are available.
      const OutputInfo *o = (const OutputInfo *)*frameData;

      VSFrameRef *dst = nullptr;
      const VSFrameRef *frame1 = vsapi->getFrameFilter(o->f1, clip2, frameCtx);

      if (o->type == SingleFrame) {
          dst = vsapi->copyFrame(frame1, core);
      } else if (o->type == TwoFramesBlended) {
          const VSFrameRef *frame2 = vsapi->getFrameFilter(o->f2, clip2, frameCtx);
          dst = vsapi->newVideoFrame(vi_clip2->format, vi_clip2->width, vi_clip2->height, frame1, core);
          blendFrames(frame1, frame2, dst, o->a1);
          vsapi->freeFrame(frame2);
      }
      vsapi->freeFrame(frame1);

      if (display)
          displayOutput(dst, o->requested_frame_number, o->chosen_frame_number, o->film, o->a1, o->a2, o->f1, o->f2);

      int64_t duration_num = vi.fpsDen;
      int64_t duration_den = vi.fpsNum;

      // Film frames last longer: scale the duration by cycle / (number of frames kept).
      if (o->film) {
          int mul = cycle;
          int div = cycle - cycleR;
          if (curr.blend == 3)
              div--;

          muldivRational(&duration_num, &duration_den, mul, div);
      }

      VSMap *props = vsapi->getFramePropsRW(dst);

      vsapi->propSetInt(props, PROP_DurationNum, duration_num, paReplace);
      vsapi->propSetInt(props, PROP_DurationDen, duration_den, paReplace);

      delete o;
      *frameData = nullptr;

      return dst;
  }

  // Step 2 starts here.
  if (n == 0)
  {
    vidC = filmC = longestT = longestV = countVT = 0;
    timestamp = 0.0;
  }
  if (linearCount != n) {
      vsapi->setFilterError("TDecimate:  non-linear access detected in mode 3!", frameCtx);
      return nullptr;
  }
  ++linearCount;
  if (n == 0 || n - lastGroup == retFrames)
  {
    // A new group of output frames begins: move on to the next input cycle.
    lastGroup = n;
    lastCycle += cycle;
//    if (ecf) child->SetCacheHints(lastCycle, -20);
    prev = curr;
    if (prev.frame != lastCycle - cycle)
    {
      prev.setFrame(lastCycle - cycle);
      getOvrCycle(prev, false);
      calcMetricCycle(prev, true, true, core, frameCtx);
      checkVideoMatches(prev, prev);
      checkVideoMetrics(prev, vidThresh);
      if (output.size()) addMetricCycle(prev);
    }
    curr = next;
    if (curr.frame != lastCycle)
    {
      curr.setFrame(lastCycle);
      getOvrCycle(curr, false);
      calcMetricCycle(curr, true, true, core, frameCtx);
      checkVideoMatches(prev, curr);
      checkVideoMetrics(curr, vidThresh);
      if (output.size()) addMetricCycle(curr);
    }
    next = nbuf;
    if (next.frame != lastCycle + cycle)
      next.setFrame(lastCycle + cycle);
    getOvrCycle(next, false);
    calcMetricCycle(next, true, true, core, frameCtx);
    checkVideoMatches(curr, next);
    checkVideoMetrics(next, vidThresh);
    if (output.size()) addMetricCycle(next);

    nbuf.setFrame(lastCycle + cycle * 2);
    getOvrCycle(nbuf, false);
    int scenetest = curr.sceneDetect(prev, next, sceneThreshU);
    bool isVid = ((curr.type == 2 || curr.type == 4) && !curr.isfilmd2v && // matches
      (prev.type == 5 || (prev.type == 2 && (vidDetect == 0 || vidDetect == 2)) || prev.type == 4 ||
        next.type == 5 || (next.type == 2 && (vidDetect == 0 || vidDetect == 2)) || next.type == 4 ||
        conCycle == 1 || scenetest != -20));
    bool isVid2 = ((curr.type == 3 || curr.type == 4) && !curr.isfilmd2v && // metrics
      (prev.type == 5 || (prev.type == 3 && (vidDetect == 1 || vidDetect == 2)) || prev.type == 4 ||
        next.type == 5 || (next.type == 3 && (vidDetect == 1 || vidDetect == 2)) || next.type == 4 ||
        conCycle == 1 || scenetest != -20));
    if (curr.type == 5 || (vidDetect == 0 && isVid) || (vidDetect == 1 && isVid2) ||
      (vidDetect == 2 && (isVid2 || isVid)) || (vidDetect == 3 && (isVid2 && isVid)))
    {
      // Video cycle: every frame of the cycle is returned.
      retFrames = cycle;
      vidC += (curr.frame + cycle <= nfrms ? cycle : nfrms - curr.frame + 1);
      longestT += (curr.frame + cycle <= nfrms ? cycle : nfrms - curr.frame + 1);
      if (!tcfv1)
      {
        int stop = (lastCycle + cycle <= nfrms ? cycle : nfrms - lastCycle + 1);
        for (int u = 0; u < stop; ++u)
        {
          fprintf(mkvOutF, "%3.6f\n", timestamp);
          timestamp += 1000.0 / fps;
        }
      }
    }
    else
    {
      // Film cycle: decide which frame(s) to drop.
      if (vfrDec != 1)
      {
        mostSimilarDecDecision(prev, curr, next);
      }
      else
      {
        prev.setDups(dupThresh);
        curr.setDups(dupThresh);
        next.setDups(dupThresh);
        findDupStrings(prev, curr, next);
      }
      filmC += (curr.frame + cycle <= nfrms ? cycle : nfrms - curr.frame + 1);
      if (retFrames == cycle)
      {
        // The previous group was video: close the running video-section statistics.
        if (longestT > longestV) longestV = longestT;
        ++countVT;
        longestT = 0;
      }
      if (curr.blend != 3)
      {
        if (!tcfv1)
        {
          int stop = (lastCycle + cycle <= nfrms ? cycle - cycleR : nfrms - lastCycle + 1 - cycleR);
          for (int u = 0; u < stop; ++u)
          {
            fprintf(mkvOutF, "%3.6f\n", timestamp);
            timestamp += 1000.0 / mkvfps;
          }
        }
        retFrames = cycle - cycleR;
      }
      else
      {
        // curr.blend == 3: one frame fewer is returned for this cycle.
        if (lastType > 0 && tcfv1)
          fprintf(mkvOutF, "%d,%d,%4.6f\n", lastGroup - (lastType*(cycle - cycleR)), lastGroup - 1, mkvfps);
        if (!tcfv1)
        {
          int stop = (lastCycle + cycle <= nfrms ? cycle - cycleR - 1 : nfrms - lastCycle + 1 - cycleR - 1);
          for (int u = 0; u < stop; ++u)
          {
            fprintf(mkvOutF, "%3.6f\n", timestamp);
            timestamp += 1000.0 / mkvfps2;
          }
        }
        else fprintf(mkvOutF, "%d,%d,%4.6f\n", lastGroup, lastGroup + cycle - cycleR - 2, mkvfps2);
        retFrames = cycle - cycleR - 1;
      }
    }
    if (retFrames == cycle && lastType > 0 && tcfv1)
      fprintf(mkvOutF, "%d,%d,%4.6f\n", lastGroup - (lastType*(cycle - cycleR)), lastGroup - 1, mkvfps);
    if (retFrames == cycle - cycleR) ++lastType;
    else lastType = 0;
//    if (debug) debugOutput1(n, retFrames == cycle ? false : true, curr.blend);
  }

  // Compute one still-missing metric of the look-ahead cycle per call.
  for (int j = nbuf.cycleS; j < nbuf.cycleE; ++j)
  {
    if (nbuf.diffMetricsU[j] == UINT64_MAX || nbuf.diffMetricsUF[j] == UINT64_MAX ||
      nbuf.match[j] == -20)
    {
      calcMetricPreBuf(next.frameEO - 1 + j, next.frameEO + j, j, vi_child, true, true, frameCtx, core);
      break;
    }
  }

  if (retFrames == cycle)
  {
    if (lastCycle + (n - lastGroup) > nfrms)
    {
      // Past the last input frame: mark the end of the output.
      retFrames = -1;
      lastFrame = n - 1;
      fprintf(mkvOutF, "# TDecimate Mode 3:  Last Frame = %d\n", lastFrame);
    }
    else
    {
//      if (debug) debugOutput2(n, lastCycle + (n - lastGroup), false, 0, 0, 0.0, 0.0);

      OutputInfo *o = new OutputInfo;
      *frameData = (void *)o;

      o->set(SingleFrame, lastCycle + (n - lastGroup), -69, 0.0, 0.0, n, lastCycle + (n - lastGroup), false);
      o->requestFrames(clip2, frameCtx, vsapi);
      return nullptr;
    }
  }
  else if (retFrames == cycle - cycleR || (curr.blend == 3 && retFrames == cycle - cycleR - 1))
  {
    int ret = curr.getNonDec(n - lastGroup);
    if ((ret >= 0 && curr.frame + ret > nfrms) || ret < 0)
    {
      // No further frame to return: mark the end of the output.
      retFrames = -1;
      lastFrame = n - 1;
      if (lastType > 0)
      {
        if (tcfv1) fprintf(mkvOutF, "%d,%d,%4.6f\n", lastGroup - ((lastType - 1)*(cycle - cycleR)), lastFrame, mkvfps);
        lastType = 0;
      }
      fprintf(mkvOutF, "# TDecimate Mode 3:  Last Frame = %d\n", lastFrame);
    }
    else
    {
//      if (debug) debugOutput2(n, curr.frame + ret, true, 0, 0, 0.0, 0.0);

      OutputInfo *o = new OutputInfo;
      *frameData = (void *)o;

      o->set(SingleFrame, curr.frame + ret, -69, 0.0, 0.0, n, curr.frame + ret, true);
      o->requestFrames(clip2, frameCtx, vsapi);
      return nullptr;
    }
  }

  // The end was reached: write the statistics once and close the timecode file.
  if (retFrames == -1 && mkvOutF != nullptr)
  {
    double filmCf = ((double)(filmC) / (double)(nfrms + 1))*100.0;
    double videoCf = ((double)(vidC) / (double)(nfrms + 1))*100.0;
    fprintf(mkvOutF, "# vfr stats:  %05.2f%c film  %05.2f%c video\n", filmCf, '%', videoCf, '%');
    fprintf(mkvOutF, "# vfr stats:  %d - film  %d - video  %d - total\n", filmC, vidC, nfrms + 1);
    fprintf(mkvOutF, "# vfr stats:  longest vid section - %d frames\n", longestV);
    fprintf(mkvOutF, "# vfr stats:  # of detected vid sections - %d", countVT);
    fclose(mkvOutF);
    mkvOutF = nullptr;
  }

  // NOTE(review): the only negative value this function itself assigns to retFrames is -1,
  // and the decrement below sits inside the `<= -305` branch. Unless retFrames is set to
  // <= -305 elsewhere, the two branches below are not reached from here and the function
  // falls through to the final `return nullptr` — confirm the intended thresholds.
  if (retFrames <= -306 && se) {
      vsapi->setFilterError("TDecimate:  mode 3 finished (early termination)!", frameCtx);
      return nullptr;
  }

  if (retFrames <= -305)
  {
      // I refuse to copy the text drawing code.
      // Instead: build a blank clip from clip2, draw the message with text.Text and
      // return frame 0 of the resulting clip.

      std::string last = "Mode 3:  Last Actual Frame = " + std::to_string(lastFrame);

      VSPlugin *std_plugin = vsapi->getPluginById("com.vapoursynth.std", core);
      VSPlugin *text_plugin = vsapi->getPluginById("com.vapoursynth.text", core);

      VSMap *args = vsapi->createMap();
      vsapi->propSetNode(args, "clip", clip2, paReplace);
      VSMap *ret = vsapi->invoke(std_plugin, "BlankClip", args);
      vsapi->clearMap(args); // args is reused for the Text call below
      if (vsapi->getError(ret)) {
          std::string msg = "TDecimate: failed to invoke std.BlankClip to show this message: '" + last + "'. " + vsapi->getError(ret);
          vsapi->setFilterError(msg.c_str(), frameCtx);
          vsapi->freeMap(args);
          vsapi->freeMap(ret);
          return nullptr;
      }
      VSNodeRef *node = vsapi->propGetNode(ret, "clip", 0, nullptr);
      vsapi->freeMap(ret);
      vsapi->propSetNode(args, "clip", node, paReplace);
      vsapi->freeNode(node);
      node = nullptr;
      vsapi->propSetData(args, "text", last.c_str(), last.size(), paReplace);
      ret = vsapi->invoke(text_plugin, "Text", args);
      vsapi->freeMap(args);
      if (vsapi->getError(ret)) {
          std::string msg = "TDecimate: failed to invoke text.Text to show this message: '" + last + "'. " + vsapi->getError(ret);
          vsapi->setFilterError(msg.c_str(), frameCtx);
          vsapi->freeMap(ret);
          return nullptr;
      }
      node = vsapi->propGetNode(ret, "clip", 0, nullptr);
      vsapi->freeMap(ret);

      char error[160] = { 0 };
      const VSFrameRef *dst = vsapi->getFrame(0, node, error, 160);
      vsapi->freeNode(node);
      if (dst == nullptr) {
          std::string msg = "TDecimate: failed to generate the frame with this message: '" + last + "'. " + error;
          vsapi->setFilterError(msg.c_str(), frameCtx);
          return nullptr;
      }

      --retFrames;
      return dst;
  }

  return nullptr; // Should not be reachable.
}

/*
**  Frame getter for mode 4 (metrics output).
**
**  On the initial activation it requests frames n-1 (clamped to 0) and n of
**  the child clip plus frame n of clip2.  Once all frames are ready it takes
**  whatever metrics getOvrFrame() supplies and/or computes them with
**  calcMetric(), stores them in metricsOutArray when an output is configured,
**  and returns a copy of frame n of clip2.  When display is enabled the
**  informational text is attached to that copy as a frame property.
**
**  Returns nullptr for every activation reason other than arAllFramesReady.
*/
const VSFrameRef * TDecimate::GetFrameMode4(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core)
{
  // Frame 0 has no predecessor, so it is compared against itself.
  const int prevN = n > 0 ? n - 1 : 0;

  if (activationReason == arInitial)
  {
    vsapi->requestFrameFilter(prevN, child, frameCtx);
    vsapi->requestFrameFilter(n, child, frameCtx);

    vsapi->requestFrameFilter(n, clip2, frameCtx);

    return nullptr;
  }
  if (activationReason != arAllFramesReady)
    return nullptr;

  const VSFrameRef *prevFrame = vsapi->getFrameFilter(prevN, child, frameCtx);
  const VSFrameRef *currFrame = vsapi->getFrameFilter(n, child, frameCtx);

  // UINT64_MAX marks "no value yet"; getOvrFrame() may replace either metric.
  int blockN = -20, xblocks;
  uint64_t metricU = UINT64_MAX;
  uint64_t metricF = UINT64_MAX;
  getOvrFrame(n, metricU, metricF);

  // Compute the metrics when either one is still missing, and always when
  // the display is on.
  const bool metricsComplete = metricU != UINT64_MAX && metricF != UINT64_MAX;
  if (!metricsComplete || display)
    metricU = calcMetric(prevFrame, currFrame, vi_child, blockN, xblocks, metricF, true, core);

  vsapi->freeFrame(prevFrame);
  vsapi->freeFrame(currFrame);

  const double metricN = (metricU*100.0) / MAX_DIFF;

  // Two slots per frame: [2n] holds metricU, [2n + 1] holds metricF.
  if (output.size() && metricsOutArray.size())
  {
    metricsOutArray[n << 1] = metricU;
    metricsOutArray[(n << 1) + 1] = metricF;
  }

  // The returned picture comes from clip2, not from the clip that was measured.
  const VSFrameRef *shown = vsapi->getFrameFilter(n, clip2, frameCtx);
  VSFrameRef *dst = vsapi->copyFrame(shown, core);
  vsapi->freeFrame(shown);

  VSMap *props = vsapi->getFramePropsRW(dst);

  if (display)
  {
    // Note: the box drawing of the AviSynth version (drawBox) is not done here.
    char buf[160] = { 0 };

    std::string text = "TDecimate " VERSION " by tritical\n";
    text += "Mode: 4 (metrics output)\n";

    snprintf(buf, sizeof(buf), "chroma = %s  denoise = %s\n", chroma ? "true" : "false",
      predenoise ? "true" : "false");
    text += buf;

    snprintf(buf, sizeof(buf), "Frame %d:  %3.2f  %3.2f\n", n, metricN, (double)metricF*100.0 / (double)sceneDivU);
    text += buf;

    vsapi->propSetData(props, PROP_TDecimateDisplay, text.c_str(), text.size(), paReplace);
  }

  return dst;
}

/*
**  Frame getter for modes 5 and 6.
**
**  Output frame n maps to input frame aLUT[n] of clip2.  A copy of that frame
**  is returned with its duration properties replaced by the values stored in
**  frame_duration_info for the input frame.  When display is enabled, the
**  informational text is attached as a frame property as well.
**
**  Returns nullptr for every activation reason other than arAllFramesReady.
*/
const VSFrameRef * TDecimate::GetFrameMode56(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core)
{
  const int frame = aLUT[n];
  const int durNum = frame_duration_info[frame].first;
  const int durDen = frame_duration_info[frame].second;

  if (activationReason == arInitial)
  {
    vsapi->requestFrameFilter(frame, clip2, frameCtx);
    return nullptr;
  }
  if (activationReason != arAllFramesReady)
    return nullptr;

  const VSFrameRef *picked = vsapi->getFrameFilter(frame, clip2, frameCtx);
  VSFrameRef *dst = vsapi->copyFrame(picked, core);
  vsapi->freeFrame(picked);

  VSMap *props = vsapi->getFramePropsRW(dst);

  if (display)
  {
    char buf[160] = { 0 };

    std::string text = "TDecimate " VERSION " by tritical\n";

    // Only mode 5 reports the hybrid setting.
    if (mode == 5)
      snprintf(buf, sizeof(buf), "Mode: %d (vfr)  Hybrid = %d\n", mode, hybrid);
    else
      snprintf(buf, sizeof(buf), "Mode: %d (120fps -> vfr)\n", mode);
    text += buf;

    snprintf(buf, sizeof(buf), "inframe = %d  useframe = %d\n", n, frame);
    text += buf;

    vsapi->propSetData(props, PROP_TDecimateDisplay, text.c_str(), text.size(), paReplace);
  }

  vsapi->propSetInt(props, PROP_DurationNum, durNum, paReplace);
  vsapi->propSetInt(props, PROP_DurationDen, durDen, paReplace);

  return dst;
}

// PF 180131: uses usehints, but this already runs at runtime, so no problem.
//
// Re-evaluates the cycles one after another, starting with the cycle at frame 0 and
// advancing by `cycle` frames for as long as the cycle start (EvalGroup) is below s.
// For each step the prev/curr/next members are slid forward by one cycle, their
// metrics are (re)computed where needed, and the decimation/blend decision for curr
// is made.
//
//   s        - first frame number that is NOT re-evaluated (loop runs while EvalGroup < s)
//   frameCtx - frame context forwarded to calcMetricCycle() for fetching frames
//   core     - VapourSynth core forwarded to calcMetricCycle()
void TDecimate::rerunFromStart(const int s, VSFrameContext *frameCtx, VSCore *core)
{
  int EvalGroup = 0;
  while (EvalGroup < s)
  {
    // Slide the window: the old curr becomes prev.  It is only rebuilt when it does
    // not already describe the cycle that starts at EvalGroup - cycle.
    prev = curr;
    if (prev.frame != EvalGroup - cycle)
    {
      prev.setFrame(EvalGroup - cycle);
      getOvrCycle(prev, false);
      calcMetricCycle(prev, true, true, core, frameCtx);
      if (hybrid > 0)
      {
        checkVideoMatches(prev, prev);
        checkVideoMetrics(prev, vidThresh);
      }
    }
    // The old next becomes curr; same reuse-or-rebuild rule as above.
    curr = next;
    if (curr.frame != EvalGroup)
    {
      curr.setFrame(EvalGroup);
      getOvrCycle(curr, false);
      calcMetricCycle(curr, true, true, core, frameCtx);
      if (hybrid > 0)
      {
        checkVideoMatches(prev, curr);
        checkVideoMetrics(curr, vidThresh);
      }
    }
    // next is always set up for the cycle that follows curr.
    next.setFrame(EvalGroup + cycle);
    getOvrCycle(next, false);
    calcMetricCycle(next, true, true, core, frameCtx);
    if (hybrid > 0)
    {
      checkVideoMatches(curr, next);
      checkVideoMetrics(next, vidThresh);
    }
    if (hybrid > 0 && curr.type > 1)
    {
      // sceneDetect() returns -20 when it has nothing to report (see the != -20 tests below).
      int scenetest = curr.sceneDetect(prev, next, sceneThreshU);
      // isVid: curr is type 2 or 4 ("matches"), is not flagged isfilmd2v, and is backed up
      // by a neighbouring cycle type, by conCycle == 1, or by a scene detection result.
      bool isVid = ((curr.type == 2 || curr.type == 4) && !curr.isfilmd2v && // matches
        (prev.type == 5 || (prev.type == 2 && (vidDetect == 0 || vidDetect == 2)) || prev.type == 4 ||
          next.type == 5 || (next.type == 2 && (vidDetect == 0 || vidDetect == 2)) || next.type == 4 ||
          conCycle == 1 || scenetest != -20));
      // isVid2: the same test for type 3 or 4 ("metrics").
      bool isVid2 = ((curr.type == 3 || curr.type == 4) && !curr.isfilmd2v && // metrics
        (prev.type == 5 || (prev.type == 3 && (vidDetect == 1 || vidDetect == 2)) || prev.type == 4 ||
          next.type == 5 || (next.type == 3 && (vidDetect == 1 || vidDetect == 2)) || next.type == 4 ||
          conCycle == 1 || scenetest != -20));
      // vidDetect selects how the two tests are combined: 0 = isVid, 1 = isVid2,
      // 2 = either, 3 = both.  Type 5 always takes this branch.
      if (curr.type == 5 || (vidDetect == 0 && isVid) || (vidDetect == 1 && isVid2) ||
        (vidDetect == 2 && (isVid2 || isVid)) || (vidDetect == 3 && (isVid2 && isVid)))
      {
        // NOTE(review): this repeats the sceneDetect() call made for scenetest with the
        // same arguments.
        int temp = curr.sceneDetect(prev, next, sceneThreshU);
        if (temp != -20 && hybrid != 3)
        {
          // Clear all decimation flags of the cycle, then mark only the position
          // reported by sceneDetect().
          for (int p = curr.cycleS; p < curr.cycleE; ++p) curr.decimate[p] = curr.decimate2[p] = 0;
          curr.decimate[temp] = curr.decimate2[temp] = 1;
          curr.blend = 2;
          curr.decSet = true;
        }
        else curr.blend = 1;
      }
      // Not treated as video: jump into the else-body below and decide as usual.
      else { goto novidjump; }
    }
    else
    {
    novidjump:
      if (mode == 0) mostSimilarDecDecision(prev, curr, next);
      else
      {
        prev.setDups(dupThresh);
        curr.setDups(dupThresh);
        next.setDups(dupThresh);
        findDupStrings(prev, curr, next);
      }
      // blend == 3 means two frames were marked (set by the decision helpers).  If a
      // scene detection result lands on one of the marked frames, that frame is
      // unmarked and blending is turned off.
      if (curr.blend == 3)
      {
        int tscene = curr.sceneDetect(prev, next, sceneThreshU);
        if (tscene != -20 && curr.decimate[tscene] == 1 && hybrid != 3)
        {
          curr.decimate[tscene] = curr.decimate2[tscene] = 0;
          curr.blend = 0;
        }
      }
      // Any blend value other than 3 is reset to 0 on this path.
      if (curr.blend != 3) curr.blend = 0;
    }
    EvalGroup += cycle;
  }
}

void TDecimate::calcMetricPreBuf(int n1, int n2, int pos, const VSVideoInfo *vit, bool scene,
  bool gethint, VSFrameContext *frameCtx, VSCore *core)
{
  if (n2 > nbuf.maxFrame || n2 < 0) return;
//  if (n2 < nbuf.frameSO || n2 >= nbuf.frameEO || n1 != n2 - 1 ||
//    nbuf.frameSO + pos != n2)
//    env->ThrowError("TDecimate:  internal error during pre-buffering (n1=%d,n2=%d,pos=%d,nbuf.FrameSO=%d,nBuf.frameEO=%d)!",
//      n1, n2, pos, nbuf.frameSO, nbuf.frameEO);
  if (n2 == 0) n1 = 0;
  int blockNI, xblocksI;
  uint64_t metricF;
  const VSFrameRef *src = nullptr;
  if (nbuf.diffMetricsU[pos] == UINT64_MAX ||
    (nbuf.diffMetricsUF[pos] == UINT64_MAX && scene))
  {
    src = vsapi->getFrameFilter(n2, child, frameCtx);
    const VSFrameRef *frame = vsapi->getFrameFilter(n1, child, frameCtx);
    nbuf.diffMetricsU[pos] = calcMetric(frame, src, vit, blockNI, xblocksI, metricF, scene, core);
    vsapi->freeFrame(frame);
    nbuf.diffMetricsN[pos] = (nbuf.diffMetricsU[pos] * 100.0) / MAX_DIFF;
    if (scene) nbuf.diffMetricsUF[pos] = metricF;
  }
  if (gethint && nbuf.match[pos] == -20)
  {
    if (!usehints) nbuf.match[pos] = -200;
    else
    {
      if (!src)
        src = vsapi->getFrameFilter(n2, child, frameCtx);

      nbuf.match[pos] = getTFMFrameProperties(src, nbuf.filmd2v[pos]);

    }
  }
  vsapi->freeFrame(src);
}

/*
**  Computes the per-block difference sums between two frames into d.diff.
**
**  prevt/currt - the two frames to compare
**  d           - settings (block sizes, chroma, nt, ssd, ...) and outputs:
**                d.diff receives the block sums; *d.metricF receives the
**                scene metric when d.metricF_needed is set
**  core/vsapi  - VapourSynth handles used for frame allocation and access
**
**  When d.predenoise is set the comparison is done on blurred versions of
**  the two frames, otherwise on copies of them.  Only the first plane is
**  processed unless d.chroma is set.
*/
void CalcMetricsExtracted(const VSFrameRef *prevt, const VSFrameRef *currt, CalcMetricData& d, VSCore *core, const VSAPI *vsapi)
{
  VSFrameRef *prev = nullptr;
  VSFrameRef *curr = nullptr;

  if (!d.predenoise)
  {
    prev = vsapi->copyFrame(prevt, core);
    curr = vsapi->copyFrame(currt, core);
  }
  else
  {
    prev = vsapi->newVideoFrame(d.vi.format, d.vi.width, d.vi.height, nullptr, core);
    curr = vsapi->newVideoFrame(d.vi.format, d.vi.width, d.vi.height, nullptr, core);
    blurFrame(prevt, prev, 2, d.chroma, d.cpuFlags, core, vsapi);
    blurFrame(currt, curr, 2, d.chroma, d.cpuFlags, core, vsapi);
  }

  // Block grid geometry; four accumulator slots are kept per block position.
  const int xblocks = ((d.vi.width + d.blockx_half) >> d.blockx_shift) + 1;
  const int yblocks = ((d.vi.height + d.blocky_half) >> d.blocky_shift) + 1;
  const int xblocks4 = xblocks << 2;
  const int arraysize = (xblocks * yblocks) << 2;

  memset(d.diff, 0, arraysize * sizeof(uint64_t));

  const bool use_sse2 = d.cpuFlags->sse2;
  const int pixelsize = d.vi.format->bytesPerSample;
  const int planeCount = d.chroma ? d.vi.format->numPlanes : 1; // luma only without chroma

  // The SSE2 kernels are only used for 8 bit samples and when no noise threshold is set.
  const bool simdUsable = use_sse2 && pixelsize == 1 && d.nt <= 0;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    const uint8_t *prvp = vsapi->getReadPtr(prev, plane);
    const int prv_pitch = vsapi->getStride(prev, plane) / pixelsize;
    const int width = vsapi->getFrameWidth(prev, plane);
    const int height = vsapi->getFrameHeight(prev, plane);
    const uint8_t *curp = vsapi->getReadPtr(curr, plane);
    const int cur_pitch = vsapi->getStride(curr, plane) / pixelsize;

    // The sums are gathered in the uint64_t entries of d.diff.

    if (simdUsable && d.blockx == 32 && d.blocky == 32)
    {
      // Dedicated 32x32 kernels.
      if (d.ssd)
        calcDiffSSD_32x32_SSE2(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, &d.vi);
      else
        calcDiffSAD_32x32_SSE2(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, &d.vi);
    }
    else if (simdUsable && d.blockx >= 16 && d.blocky >= 16)
    {
      // Generic SSE2 kernels for the remaining block sizes of at least 16x16.
      if (d.ssd)
        calcDiffSSD_Generic_SSE2(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, &d.vi);
      else
        calcDiffSAD_Generic_SSE2(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, &d.vi);
    }
    else if (pixelsize == 1)
    {
      // Plain C, 8 bit samples.
      if (d.ssd)
        calcDiff_SADorSSD_Generic_c<uint8_t, false, 1>(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, d.nt, &d.vi);
      else
        calcDiff_SADorSSD_Generic_c<uint8_t, true, 1>(prvp, curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, d.nt, &d.vi);
    }
    else
    {
      // Plain C, 16 bit containers (10-16 bits).  fixme: no SIMD version for these yet.
      if (d.ssd)
        calcDiff_SADorSSD_Generic_c<uint16_t, false, 1>((const uint16_t*)prvp, (const uint16_t*)curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, d.nt, &d.vi);
      else
        calcDiff_SADorSSD_Generic_c<uint16_t, true, 1>((const uint16_t *)prvp, (const uint16_t*)curp, prv_pitch, cur_pitch, width, height, plane, xblocks4, d.diff, d.chroma, d.blockx_shift, d.blocky_shift, d.blockx_half, d.blocky_half, d.nt, &d.vi);
    }

    // The scene metric is taken right after the first plane: it is reset to 0
    // and, when scene detection is wanted, becomes the sum of every fourth
    // entry of d.diff.
    if (d.metricF_needed && plane == 0)
    {
      *d.metricF = 0;
      if (d.scene)
      {
        for (int x = 0; x < arraysize; x += 4)
          *d.metricF += d.diff[x];
      }
    }
  }

  vsapi->freeFrame(prev);
  vsapi->freeFrame(curr);
}

uint64_t TDecimate::calcMetric(const VSFrameRef *prevt, const VSFrameRef *currt, const VSVideoInfo *vit, int &blockNI,
  int &xblocksI, uint64_t &metricF, bool scene, VSCore *core) const
{
  uint64_t highestDiff = 0;

  struct CalcMetricData d;
  //d.np = np;
  d.predenoise = predenoise;
  d.vi = *vit;
  d.chroma = chroma;
  d.cpuFlags = &cpuFlags;
  d.blockx = blockx;
  d.blockx_half = blockx_half;
  d.blockx_shift = blockx_shift;
  d.blocky = blocky;
  d.blocky_half = blocky_half;
  d.blocky_shift = blocky_shift;
  d.diff = diff.get();
  d.nt = nt;
  d.ssd = ssd;

  d.metricF_needed = true;
  d.metricF = &metricF;
  d.scene = scene; 

  CalcMetricsExtracted(prevt, currt, d, core, vsapi);

  int xblocks = ((d.vi.width + d.blockx_half) >> d.blockx_shift) + 1;
  int xblocks4 = xblocks << 2;
  int yblocks = ((d.vi.height + d.blocky_half) >> d.blocky_shift) + 1;
  int arraysize = (xblocks * yblocks) << 2;

  // output parameters
  blockNI = -20;
  xblocksI = xblocks4;

  for (int x = 0; x < arraysize; ++x)
  {
    if (diff.get()[x] > highestDiff)
    {
      highestDiff = diff.get()[x];
      blockNI = x;
    }
  }
  if (blockNI == -20) blockNI = 0;
  if (ssd)
  {
    highestDiff = (uint64_t)(sqrt((double)(highestDiff)));
    metricF = (uint64_t)(sqrt((double)(metricF)));
  }
  return highestDiff;
}

// PF 180131 uses usehints!
//
// Fills in the missing metrics and match values of a cycle.
//
//   current  - cycle to complete; positions cycleS..cycleE-1 are visited, and position i
//              corresponds to frame number frameSO + (i - cycleS)
//   scene    - when true, the scene metric (diffMetricsUF) is required as well
//   hnt      - when true, match[] entries still at -20 are filled in (from the frame
//              properties when usehints is set, otherwise with -200)
//   core     - VapourSynth core used for frame allocation/copies
//   frameCtx - when non-null frames are fetched with getFrameFilter(), otherwise with
//              the synchronous getFrame()
//
// Does nothing when the cycle already has its metrics (mSet) or is empty.  On
// completion current.mSet is set and current.setIsFilmD2V() is called.
void TDecimate::calcMetricCycle(Cycle &current, bool scene, bool hnt, VSCore *core, VSFrameContext *frameCtx) const
{
  if (current.mSet || current.cycleS == current.cycleE)
    return;

  int i, w;
  uint64_t highestDiff;
  // next_num:  frame number currently held in nextt (predenoise) / nxt (otherwise)
  // next_numd: frame number whose blurred version is currently held in nxt (predenoise only)
  // -20 means "none".
  int next_num = -20, next_numd = -20;

  // prv/nxt are the frames handed to the metric calculation; prevt/nextt hold the
  // unblurred source frames and are only used on the predenoise path.
  VSFrameRef *prv = nullptr, *nxt = nullptr;
  const VSFrameRef *prevt = nullptr, *nextt = nullptr;
  if (predenoise)
  {
    prv = vsapi->newVideoFrame(vi_child->format, vi_child->width, vi_child->height, nullptr, core);
    nxt = vsapi->newVideoFrame(vi_child->format, vi_child->width, vi_child->height, nullptr, core);
  }

  for (w = current.frameSO, i = current.cycleS; i < current.cycleE; ++i, ++w)
  {
    // Nothing to do for this position when match and metrics are all present
    // (match only matters with hnt, the scene metric only with scene).
    if ((current.match[i] != -20 || !hnt) && current.diffMetricsU[i] != UINT64_MAX &&
      (current.diffMetricsUF[i] != UINT64_MAX || !scene)) continue;
    if (predenoise)
    {
      // Metrics are present, so only the match can be missing here.
      if (current.diffMetricsU[i] != UINT64_MAX &&
        (current.diffMetricsUF[i] != UINT64_MAX || !scene))
      {
        if (current.match[i] == -20 && hnt)
        {
          if (!usehints) current.match[i] = -200;
          else
          {
              vsapi->freeFrame(nextt);
            if (frameCtx)
                nextt = vsapi->getFrameFilter(w, child, frameCtx);
            else
                nextt = vsapi->getFrame(w, child, nullptr, 0);
            // nextt now holds frame w, but nxt (the blurred frame) is not updated,
            // which is why next_numd is left alone.
            next_num = w;
            current.match[i] = getTFMFrameProperties(nextt, current.filmd2v[i]);
          }
        }
        continue;
      }

      // Previous frame: reuse the source frame fetched in the last iteration when
      // it is frame w - 1, otherwise fetch it (frame 0 is its own predecessor).
      vsapi->freeFrame(prevt);
      if (next_num == w - 1)
        prevt = vsapi->cloneFrameRef(nextt);
      else {
          if (frameCtx)
            prevt = vsapi->getFrameFilter(w > 0 ? w - 1 : 0, child, frameCtx);
          else
            prevt = vsapi->getFrame(w > 0 ? w - 1 : 0, child, nullptr, 0);
      }

      vsapi->freeFrame(nextt);
      if (frameCtx)
        nextt = vsapi->getFrameFilter(w, child, frameCtx);
      else
        nextt = vsapi->getFrame(w, child, nullptr, 0);
      next_num = w;
      if (current.match[i] == -20 && hnt)
      {
        if (!usehints) current.match[i] = -200;
        else current.match[i] = getTFMFrameProperties(nextt, current.filmd2v[i]);
      }
      // Blurred previous frame: reuse the blurred frame of the last iteration when
      // it belongs to frame w - 1, otherwise blur prevt.
      if (next_numd == w - 1)
        copyFrame(prv, nxt, vsapi);
      else
        blurFrame(prevt, prv, 2, chroma, &cpuFlags, core, vsapi);

      blurFrame(nextt, nxt, 2, chroma, &cpuFlags, core, vsapi);
      next_numd = w;
    }
    else
    {
      // Metrics are present, so only the match can be missing here.
      if (current.diffMetricsU[i] != UINT64_MAX &&
        (current.diffMetricsUF[i] != UINT64_MAX || !scene))
      {
        if (current.match[i] == -20 && hnt)
        {
          if (!usehints) current.match[i] = -200;
          else
          {
            const VSFrameRef *tmp;
            if (frameCtx)
                tmp = vsapi->getFrameFilter(w, child, frameCtx);
            else
                tmp = vsapi->getFrame(w, child, nullptr, 0);
            vsapi->freeFrame(nxt);
            nxt = vsapi->copyFrame(tmp, core);
            vsapi->freeFrame(tmp);
            next_num = w;
            current.match[i] = getTFMFrameProperties(nxt, current.filmd2v[i]);
          }
        }
        continue;
      }

      // Previous frame: reuse nxt from the last iteration when it is frame w - 1,
      // otherwise fetch it (frame 0 is its own predecessor).
      vsapi->freeFrame(prv);
      if (next_num == w - 1)
        prv = vsapi->copyFrame(nxt, core);
      else {
        const VSFrameRef *tmp;
        if (frameCtx)
            tmp = vsapi->getFrameFilter(w > 0 ? w - 1 : 0, child, frameCtx);
        else
            tmp = vsapi->getFrame(w > 0 ? w - 1 : 0, child, nullptr, 0);
        prv = vsapi->copyFrame(tmp, core);
        vsapi->freeFrame(tmp);
      }
      const VSFrameRef *tmp;
      if (frameCtx)
          tmp = vsapi->getFrameFilter(w, child, frameCtx);
      else
          tmp = vsapi->getFrame(w, child, nullptr, 0);
      vsapi->freeFrame(nxt);
      nxt = vsapi->copyFrame(tmp, core);
      vsapi->freeFrame(tmp);
      next_num = w;
      if (current.match[i] == -20 && hnt)
      {
        if (!usehints) current.match[i] = -200;
        else current.match[i] = getTFMFrameProperties(nxt, current.filmd2v[i]);
      }
    }

    struct CalcMetricData d;
    //d.np = np;
    d.predenoise = false; // done earlier
    d.vi = *vi_child;
    d.chroma = chroma;
    d.cpuFlags = &cpuFlags;
    d.blockx = blockx;
    d.blockx_half = blockx_half;
    d.blockx_shift = blockx_shift;
    d.blocky = blocky;
    d.blocky_half = blocky_half;
    d.blocky_shift = blocky_shift;
    d.diff = diff.get();
    d.nt = nt;
    d.ssd = ssd;

    // here we need metrics and has scene
    d.metricF_needed = true;
    // The scene metric is written straight into the cycle.
    d.metricF = &current.diffMetricsUF[i];
    d.scene = scene;

    CalcMetricsExtracted(prv, nxt, d, core, vsapi);

    int xblocks = ((d.vi.width + d.blockx_half) >> d.blockx_shift) + 1;
    int yblocks = ((d.vi.height + d.blocky_half) >> d.blocky_shift) + 1;
    int arraysize = (xblocks * yblocks) << 2;

    // The frame metric is the largest block sum.
    highestDiff = 0;
    for (int x = 0; x < arraysize; ++x)
    {
      if (diff.get()[x] > highestDiff)
        highestDiff = diff.get()[x];
    }
    // With ssd the (truncated) square roots of the sums are stored.
    if (ssd)
    {
      highestDiff = (uint64_t)(sqrt((double)(highestDiff)));
      current.diffMetricsUF[i] = (uint64_t)(sqrt((double)(current.diffMetricsUF[i])));
    }
    current.diffMetricsU[i] = highestDiff;
    current.diffMetricsN[i] = (highestDiff * 100.0) / MAX_DIFF;
  }

  // Release whatever frames are still held; the pointers may be null here.
  vsapi->freeFrame(prevt);
  vsapi->freeFrame(nextt);
  vsapi->freeFrame(prv);
  vsapi->freeFrame(nxt);

  current.mSet = true;
  current.setIsFilmD2V();
}

/*
**  Accumulates the difference between two images into `diff`, looking only
**  at every second byte of each row (the luma samples of packed YUY2, going
**  by the function name).
**
**  SAD = true  : each sample contributes |prv - nxt|
**  SAD = false : each sample contributes (prv - nxt)^2
**
**  prvp/nxtp           - start of the first row of each image
**  width               - row length in bytes; nothing is done when <= 0
**  height              - number of rows
**  prv_pitch/nxt_pitch - distance in bytes between consecutive rows
**  nt                  - noise threshold: a contribution is only added when
**                        it is greater than nt
**  diff                - in/out accumulator; it is not reset here
**
**  Each contribution is added exactly once.  It used to be added a second
**  time unconditionally, which counted differences above nt twice and let
**  differences at or below nt through despite the threshold.
*/
template<bool SAD>
void calcLumaDiffYUY2_SADorSSD_c(const uint8_t* prvp, const uint8_t* nxtp,
  int width, int height, int prv_pitch, int nxt_pitch, int nt, uint64_t& diff) {

  if (width <= 0)
    return;
  for (int y = 0; y < height; ++y)
  {
    for (int x = 0; x < width; x += 2)
    {
      int temp;
      if constexpr (SAD)
        temp = abs(prvp[x] - nxtp[x]); // SAD
      else {
        temp = prvp[x] - nxtp[x];
        temp *= temp; // SSD
      }
      // Differences that do not exceed the noise threshold are ignored.
      if (temp > nt) diff += temp;
    }
    prvp += prv_pitch;
    nxtp += nxt_pitch;
  }
}

//template<bool SAD>
//uint64_t calcLumaDiffYUY2_SADorSSD(const uint8_t* prvp, const uint8_t* nxtp,
//  int width, int height, int prv_pitch, int nxt_pitch, int nt, int cpuFlags)
//{
//  uint64_t diff = 0;

//  const bool use_sse2 = (cpuFlags & CPUF_SSE2) ? true : false;

//  int widtha;

//  if (use_sse2 && (nt == 0) && width >= 16) {
//    widtha = (width / 16) * 16;
//    if constexpr(SAD)
//      calcLumaDiffYUY2SAD_SSE2_16(prvp, nxtp, widtha, height, prv_pitch, nxt_pitch, diff);
//    else
//      calcLumaDiffYUY2SSD_SSE2_16(prvp, nxtp, widtha, height, prv_pitch, nxt_pitch, diff);

//    calcLumaDiffYUY2_SADorSSD_c<SAD>(prvp + widtha, nxtp + widtha, width - widtha, height, prv_pitch, nxt_pitch, nt, diff);
//  }
//  else
//  {
//    calcLumaDiffYUY2_SADorSSD_c<SAD>(prvp, nxtp, width, height, prv_pitch, nxt_pitch, nt, diff);
//  }

//  return diff;
//}

//uint64_t calcLumaDiffYUY2_SAD(const uint8_t* prvp, const uint8_t* nxtp,
//  int width, int height, int prv_pitch, int nxt_pitch, int nt, int cpuFlags)
//{
//  return calcLumaDiffYUY2_SADorSSD<true>(prvp, nxtp, width, height, prv_pitch, nxt_pitch, nt, cpuFlags);
//}

//uint64_t calcLumaDiffYUY2_SSD(const uint8_t* prvp, const uint8_t* nxtp,
//  int width, int height, int prv_pitch, int nxt_pitch, int nt, int cpuFlags)
//{
//  return calcLumaDiffYUY2_SADorSSD<false>(prvp, nxtp, width, height, prv_pitch, nxt_pitch, nt, cpuFlags);
//}

/*
**  Reads the TFM hints stored as properties on a frame.
**
**  src     - frame whose properties are read
**  d2vfilm - out: value of the PROP_TFMD2VFilm property, 0 when it is missing
**
**  Returns the value of the PROP_TFMMATCH property, or -200 when it is
**  missing.  When the PROP_TFMField property is present and non-zero, the
**  match codes are exchanged pairwise before returning (0 <-> 3, 2 <-> 4).
*/
int TDecimate::getTFMFrameProperties(const VSFrameRef *src, int& d2vfilm) const
{
  const VSMap *props = vsapi->getFramePropsRO(src);
  int err;

  int match = int64ToIntS(vsapi->propGetInt(props, PROP_TFMMATCH, 0, &err));
  if (err)
    match = -200;

  d2vfilm = int64ToIntS(vsapi->propGetInt(props, PROP_TFMD2VFilm, 0, &err));
  if (err)
    d2vfilm = 0;

  int field = int64ToIntS(vsapi->propGetInt(props, PROP_TFMField, 0, &err));
  if (err)
    field = 0;

  // Nothing to translate without a match or with field 0.
  if (match == -200 || field == 0)
    return match;

  switch (match)
  {
  case 0: return 3;
  case 2: return 4;
  case 3: return 0;
  case 4: return 2;
  default: return match;
  }
}

/*
**  This function checks to see if there is a single match dup in the
**  current cycle and if that frame also has the lowest metric in the
**  cycle.  If those conditions are true, then it checks to see if
**  there is also such a frame in the previous or next cycle.  If
**  there is, and if it is in the same position as the one in the
**  current cycle, then the frame in the current cycle is chosen for
**  decimation.
**
**  This function is only used by longest string.
**
**  p, c, n - previous, current and next cycle.  Only c is modified, and
**            only when the function returns true: the chosen position is
**            marked in c.decimate/c.decimate2 and c.decSet is set.
**
**  Returns true when a frame was chosen, false otherwise.
*/
bool TDecimate::checkForObviousDecFrame(Cycle &p, Cycle &c, Cycle &n)
{
  // mp/mc: match value of the preceding frame / of the frame being looked at.
  // saved:  position of the (first) match dup in c.
  // saved2: position of the lowest metric in c.
  int i, v, dups = 0, mc = ISC, mp = ISC, saved = -20, saved2 = -20;
  uint64_t lowest_metric = UINT64_MAX;
  for (i = c.cycleS; i < c.cycleE; ++i)
  {
    // The first frame of c is compared against the last frame of p; -20 stands in
    // when p has no such frame.
    mp = i == c.cycleS ? (p.cycleE > 0 ? p.match[p.cycleE - 1] : -20)
                       : mc;
    mc = c.match[i];
    if (checkMatchDup(mp, mc)) ++dups;
    if (dups > 1) return false;
    if (dups == 1 && saved == -20) saved = i;
    if (c.diffMetricsU[i] < lowest_metric)
    {
      lowest_metric = c.diffMetricsU[i];
      saved2 = i;
    }
  }
  // Exactly one match dup is required, and it must sit on the lowest metric.
  if (dups != 1) return false;
  if (saved != saved2 || saved2 == -20) return false;
  lowest_metric = UINT64_MAX;
  // cp/cn: position of the single lowest-metric match dup in p / n, -20 when there is none.
  int cp = -20, cn = -20;
  for (dups = 0, v = -1, i = p.cycleS; i < p.cycleE; ++i)
  {
    // The first frame of p has no predecessor available here, so -20 is used.
    mp = i == p.cycleS ? -20 : mc;
    mc = p.match[i];
    if (checkMatchDup(mp, mc)) { ++dups; cp = i; }
    if (p.diffMetricsU[i] < lowest_metric) { v = i; lowest_metric = p.diffMetricsU[i]; }
  }
  // No match dup in p but the lowest metric is on its first frame: accept that frame
  // (presumably because its match could not be compared against a predecessor above).
  if (dups == 0 && v == p.cycleS) { dups = 1; cp = v; }
  if (dups != 1 || cp != v) cp = -20;
  // The next cycle is only examined when the previous one gave no candidate.
  if (cp == -20)
  {
    lowest_metric = UINT64_MAX;
    for (dups = 0, v = -1, i = n.cycleS; i < n.cycleE; ++i)
    {
      mp = i == n.cycleS ? c.match[c.cycleE - 1] : mc;
      mc = n.match[i];
      if (checkMatchDup(mp, mc)) { ++dups; cn = i; }
      if (n.diffMetricsU[i] < lowest_metric) { v = i; lowest_metric = n.diffMetricsU[i]; }
    }
    if (dups != 1 || cn != v) return false;
  }
  // The candidate in c must be at the same position as the one found in p or n.
  if (saved != cp && saved != cn) return false;
//  if (debug)
//  {
//    sprintf(buf, "TDecimate:  obvious dec frame found  %d - %d!\n", saved, c.frameSO);
//    OutputDebugString(buf);
//  }
  c.decimate[saved] = c.decimate2[saved] = 1;
  c.decSet = true;
  return true;
}

/*
**  Looks for exactly one frame in the current cycle that is flagged as a
**  d2v duplicate (filmd2v == 1).  That frame is reported when it is also a
**  match duplicate of the frame before it, or when it has the lowest metric
**  of the cycle.
**
**  With mode == 0, or with mode > 1 and vfrDec == 0, the flag in the current
**  cycle is sufficient.  Otherwise the previous and next cycles must carry
**  the flag at the same position.
**
**  Returns the position of the frame, or -20 when there is no such frame
**  (including the case of more than one flagged frame).  The cycles are not
**  modified.
**
**  This function is used by both longest string and most similar.
*/
int TDecimate::checkForD2VDecFrame(Cycle &p, Cycle &c, Cycle &n)
{
  int flagged = 0;
  int flaggedPos = -20, matchDupPos = -20, lowestPos = -20;
  uint64_t lowest = UINT64_MAX;

  for (int i = c.cycleS; i < c.cycleE; ++i)
  {
    const bool isD2VDup = c.filmd2v[i] == 1 &&
      (mode == 0 || (mode > 1 && vfrDec == 0) ||
        (p.filmd2v[i] == 1 && n.filmd2v[i] == 1));
    if (isD2VDup)
    {
      if (++flagged > 1)
        return -20;
      flaggedPos = i;

      // Match of the preceding frame; for the first frame of the cycle this
      // is the last frame of the previous cycle, or -20 when there is none.
      int prevMatch;
      if (i != c.cycleS)
        prevMatch = c.match[i - 1];
      else if (p.cycleE > 0)
        prevMatch = p.match[p.cycleE - 1];
      else
        prevMatch = -20;

      if (checkMatchDup(prevMatch, c.match[i]))
        matchDupPos = i;
    }

    if (c.diffMetricsU[i] < lowest)
    {
      lowest = c.diffMetricsU[i];
      lowestPos = i;
    }
  }

  if (flagged != 1)
    return -20;
  if (flaggedPos != matchDupPos && flaggedPos != lowestPos)
    return -20;

  return flaggedPos;
}

/*  This function checks for the case of two duplicates caused by an
**  ivtc pattern change.  It checks the prev and next cycles for a single
**  match duplicate with the lowest metric, in different positions, and
**  the current cycle for 2 match duplicates with the 2 lowest metrics.
**  The two dups must match the positions of the dups in the prev and
**  next cycles.  Finally, it requires that no other frames in the
**  current cycle were detected as metric duplicates.
**
**  This function is only used by longest string.
**
**  p, c, n - previous, current and next cycle.  Only c is modified, and
**            only when the function returns true.
**
**  Returns true when the case was recognised.  Then either one of the two
**  frames (hybrid == 0 and noblend) or both frames (with c.blend = 3) are
**  marked in c.decimate/c.decimate2, and c.decSet is set.
*/
bool TDecimate::checkForTwoDropLongestString(Cycle &p, Cycle &c, Cycle &n)
{
  int dupsP = 0, savedp = -20, dupsN = 0, savedn = -20;
  int c1 = -20, c2 = -20, i, v, mp, mc = ISC;
  uint64_t lowest = UINT64_MAX;
  // Previous cycle: count match dups (savedp = last one found) and find the
  // position v of the lowest metric.
  for (v = -1, i = p.cycleS; i < p.cycleE; ++i)
  {
    mp = i == p.cycleS ? -20 : mc;
    mc = p.match[i];
    if (checkMatchDup(mp, mc)) { ++dupsP; savedp = i; }
    if (p.diffMetricsU[i] < lowest) { lowest = p.diffMetricsU[i]; v = i; }
  }
  // No match dup but the lowest metric is on the first frame of p: accept that frame.
  if (dupsP == 0 && v == p.cycleS) { dupsP = 1; savedp = v; }
  if (dupsP != 1 || savedp != v) return false;
  lowest = UINT64_MAX;
  // Next cycle: same search; its first frame is compared against the last frame of c.
  for (v = -1, i = n.cycleS; i < n.cycleE; ++i)
  {
    mp = i == n.cycleS ? c.match[c.cycleE - 1] : mc;
    mc = n.match[i];
    if (checkMatchDup(mp, mc)) { ++dupsN; savedn = i; }
    if (n.diffMetricsU[i] < lowest) { lowest = n.diffMetricsU[i]; v = i; }
  }
  // The dups of p and n must be at different positions.
  if (dupsN != 1 || savedn != v || savedn == savedp) return false;
  uint64_t lowest1 = UINT64_MAX;
  uint64_t lowest2 = UINT64_MAX;
  // Current cycle: c1/c2 are the first two match dups, cl1/cl2 the positions of the
  // lowest and second lowest metric, v the number of match dups.
  int cl1 = -20, cl2 = -20;
  for (v = 0, c1 = -1, c2 = -1, i = c.cycleS; i < c.cycleE; ++i)
  {
    mp = i == c.cycleS ? (p.cycleE > 0 ? p.match[p.cycleE - 1] : -20) : mc;
    mc = c.match[i];
    if (checkMatchDup(mp, mc))
    {
      ++v;
      if (c1 == -1) c1 = i;
      else if (c2 == -1) c2 = i;
    }
    if (c.diffMetricsU[i] < lowest1)
    {
      // New lowest: the old lowest becomes the second lowest.
      lowest2 = lowest1;
      cl2 = cl1;
      lowest1 = c.diffMetricsU[i];
      cl1 = i;
    }
    else if (c.diffMetricsU[i] < lowest2)
    {
      lowest2 = c.diffMetricsU[i];
      cl2 = i;
    }
  }
  // Exactly two match dups, and they must be the two lowest metrics.
  if (v != 2 || c1 == -1 || c2 == -1 ||
    (c1 != cl1 && c1 != cl2) || (c2 != cl1 && c2 != cl2))
    return false;
  // Their positions must line up with the dups of p and n, and they must not be adjacent.
  if (c1 != savedp || c2 != savedn) return false;
  if (abs(c1 - c2) <= 1) return false;
  // No other frame of c may be flagged in dupArray.
  for (i = c.cycleS; i < c.cycleE; ++i)
  {
    if (c.dupArray[i] == 1 && c1 != i && c2 != i)
      return false;
  }
  // Neither neighbour of c1 may be flagged in dupArray (the neighbour before the first
  // frame of c is the last frame of p).
  if ((c1 == c.cycleS && (p.dupArray[p.cycleE > 0 ? p.cycleE - 1 : 0] == 1 || c.dupArray[c1 + 1] == 1)) ||
    (c1 != c.cycleS && (c.dupArray[c1 - 1] == 1 || c.dupArray[c1 + 1] == 1))) return false;
  // Same for c2 (the neighbour after the last frame of c is the first frame of n).
  if ((c2 == c.cycleE - 1 && (n.dupArray[n.cycleS] == 1 || c.dupArray[c2 - 1] == 1)) ||
    (c2 != c.cycleE - 1 && (c.dupArray[c2 - 1] == 1 || c.dupArray[c2 + 1] == 1))) return false;
  if (hybrid == 0 && noblend)
  {
    // Without blending only one frame is dropped: the one with the lower metric
    // (c1 on a tie).
    if (c.diffMetricsU[c1] <= c.diffMetricsU[c2])
      c.decimate[c1] = c.decimate2[c1] = 1;
    else
      c.decimate[c2] = c.decimate2[c2] = 1;
    c.decSet = true;
  }
  else
  {
    // Both frames are marked and the cycle is flagged with blend = 3.
    c.blend = 3;
    c.decimate[c1] = c.decimate2[c1] = 1;
    c.decimate[c2] = c.decimate2[c2] = 1;
    c.decSet = true;
//    if (debug)
//    {
//      sprintf(buf, "TDecimate:  drop two frames longest string  %d:%d - %d!\n", c1, c2, c.frameSO);
//      OutputDebugString(buf);
//    }
  }
  return true;
}

/*
**  Makes the decimation decision for cycle c (the caller in this file uses it
**  when mode == 0).
**
**  p, c, n - previous, current and next cycle.  Decisions are written to
**            c.decimate/c.decimate2 (plus c.decSet and c.blend); p and n are
**            only touched to have their dups set up when that was not done yet.
**
**  Several special cases are tried in order (a d2v duplicate, a single
**  duplicate, two duplicates lining up with the neighbouring cycles, an
**  isolated duplicate).  Whenever none applies the decision is left to
**  c.setDecimateLow(cycleR).
*/
void TDecimate::mostSimilarDecDecision(Cycle &p, Cycle &c, Cycle &n)
{
  if (!c.lowSet) c.setLowest(false);
  if (cycleR != 1)
  {
    // Dropping more than one frame per cycle: no special cases.  For a cycle shorter
    // than `cycle` the count is scaled down proportionally, but never below 1.
    c.setDecimateLow(c.frameEO - c.frameSO == cycle ? cycleR :
      std::max(int(cycleR*(c.frameEO - c.frameSO) / double(cycle)), 1));
    return;
  }
  if (!p.dupsSet) p.setDupsMatches(p, ovrArray);
  if (!c.dupsSet) c.setDupsMatches(p, ovrArray);
  if (!n.dupsSet) n.setDupsMatches(c, ovrArray);
  // d2vdecf is the position of a usable d2v duplicate in c, or -20.
  int d2vdecf = checkForD2VDecFrame(p, c, n);
  if (c.dupCount <= 0 && d2vdecf == -20)
  {
    c.setDecimateLow(cycleR);
    return;
  }
  // Clear every decimation flag that is not already 1 and count the ones that are.
  // NOTE(review): flags that are already 1 here presumably come from overrides — confirm.
  int i, ovrdups;
  for (ovrdups = 0, i = c.cycleS; i < c.cycleE; ++i)
  {
    if (c.decimate[i] != 1) c.decimate[i] = c.decimate2[i] = 0;
    else ++ovrdups;
  }
  if (ovrdups != 0)
  {
    c.setDecimateLow(cycleR);
    return;
  }
  // Positions outside [cycleS, cycleE) are marked with -20.
  for (i = 0; i < c.cycleS; ++i) c.decimate[i] = c.decimate2[i] = -20;
  for (i = c.cycleE; i < c.length; ++i) c.decimate[i] = c.decimate2[i] = -20;
  if (d2vdecf != -20)
  {
    c.decimate[d2vdecf] = c.decimate2[d2vdecf] = 1;
    c.decSet = true;
    return;
  }
  if (c.dupCount == 1)
  {
    uint64_t lowest = UINT64_MAX, lowest2 = UINT64_MAX;
    // savedc/savedp/savedn: position of the dup in c/p/n; v: position of the lowest metric.
    int savedc = -1, savedp = -1, savedn = -1, v;
    for (v = -1, i = c.cycleS; i < c.cycleE; ++i)
    {
      if (c.dupArray[i] == 1) savedc = i;
      if (c.diffMetricsU[i] < lowest) { v = i; lowest = c.diffMetricsU[i]; }
    }
    // Take the dup when it is the lowest metric, or close to it: 90% of its metric
    // does not exceed the lowest one, or the normalized metrics differ by less than 0.20.
    if (savedc == v || (double)(c.diffMetricsU[savedc] * 0.9) <= (double)lowest ||
      fabs(((double)lowest*100.0 / (double)MAX_DIFF) - c.diffMetricsN[savedc]) < 0.20)
    {
      c.decimate[savedc] = c.decimate2[savedc] = 1;
      c.decSet = true;
      return;
    }
    // Otherwise the dup is only taken when p and n each have a single dup that is
    // their lowest metric and all three dups are at the same position.
    if (p.dupCount == 1 && n.dupCount == 1)
    {
      for (lowest2 = UINT64_MAX, i = p.cycleS; i < p.cycleE; ++i)
      {
        if (p.dupArray[i] == 1) savedp = i;
        if (p.diffMetricsU[i] < lowest2) { v = i; lowest2 = p.diffMetricsU[i]; }
      }
      if (savedp != v)
      {
        c.setDecimateLow(cycleR);
        return;
      }
      for (lowest2 = UINT64_MAX, i = n.cycleS; i < n.cycleE; ++i)
      {
        if (n.dupArray[i] == 1) savedn = i;
        if (n.diffMetricsU[i] < lowest2) { v = i; lowest2 = n.diffMetricsU[i]; }
      }
      if (savedn != v || savedp != savedn || savedp != savedc)
      {
        c.setDecimateLow(cycleR);
        return;
      }
      c.decimate[savedc] = c.decimate2[savedc] = 1;
      c.decSet = true;
      return;
    }
    c.setDecimateLow(cycleR);
    return;
  }
  else
  {
    uint64_t lowestp = UINT64_MAX, lowestn = UINT64_MAX;
    int savedp = -1, savedn = -1, savedc1, savedc2, v;
    // Two dups in c with one dup each in p and n: the two-drop case.  Any failed
    // condition falls through to the generic handling at tryother.
    if (c.dupCount == 2 && p.dupCount == 1 && n.dupCount == 1)
    {
      for (v = -1, i = p.cycleS; i < p.cycleE; ++i)
      {
        if (p.dupArray[i] == 1) savedp = i;
        if (p.diffMetricsU[i] < lowestp) { v = i; lowestp = p.diffMetricsU[i]; }
      }
      if (savedp != v || v == -1) goto tryother;
      for (v = -1, i = n.cycleS; i < n.cycleE; ++i)
      {
        if (n.dupArray[i] == 1) savedn = i;
        if (n.diffMetricsU[i] < lowestn) { v = i; lowestn = n.diffMetricsU[i]; }
      }
      if (savedn != v || v == -1 || savedn == savedp) goto tryother;
      for (savedc1 = -1, savedc2 = -1, i = c.cycleS; i < c.cycleE; ++i)
      {
        if (c.dupArray[i] == 1)
        {
          if (savedc1 == -1) savedc1 = i;
          else if (savedc2 == -1) savedc2 = i;
        }
      }
      // The dups of c must line up with those of p and n, be the first two entries of
      // c.lowest[], and must not be adjacent.
      if (savedc1 != savedp || savedc2 != savedn) goto tryother;
      if (savedc1 != c.lowest[0] && savedc1 != c.lowest[1]) goto tryother;
      if (savedc2 != c.lowest[0] && savedc2 != c.lowest[1]) goto tryother;
      if (abs(savedc1 - savedc2) <= 1) goto tryother;
      if (hybrid == 0 && noblend)
      {
        // Without blending only the dup with the lower metric is dropped
        // (savedc1 on a tie).
        if (c.diffMetricsU[savedc1] <= c.diffMetricsU[savedc2])
          c.decimate[savedc1] = c.decimate2[savedc1] = 1;
        else
          c.decimate[savedc2] = c.decimate2[savedc2] = 1;
        c.decSet = true;
      }
      else
      {
        // Both dups are marked and the cycle is flagged with blend = 3.
        c.blend = 3;
        c.decimate[savedc1] = c.decimate2[savedc1] = 1;
        c.decimate[savedc2] = c.decimate2[savedc2] = 1;
        c.decSet = true;
//        if (debug)
//        {
//          sprintf(buf, "TDecimate:  drop two frames most similar  %d:%d - %d!\n", savedc1, savedc2, c.frameSO);
//          OutputDebugString(buf);
//        }
      }
      return;
    }
  tryother:
    // Generic case: look for isolated dups, i.e. dups whose neighbours on both sides
    // (taken from p/n at the cycle borders) are not dups.  v counts them.
    int savedc = -1;
    // metricP/metricN belong to the current candidate; they are only read once
    // savedc != -1, i.e. after they have been assigned.
    uint64_t metricP, metricN, metricPt, metricNt;
    for (v = 0, i = c.cycleS; i < c.cycleE; ++i)
    {
      if (c.dupArray[i] == 1)
      {
        if (((i == c.cycleS && p.dupArray[p.cycleE > 0 ? p.cycleE - 1 : 0] == 0) || (i != c.cycleS && c.dupArray[i - 1] == 0)) &&
          ((i == c.cycleE - 1 && n.dupArray[n.cycleS] == 0) || (i != c.cycleE - 1 && c.dupArray[i + 1] == 0)))
        {
          ++v;
          metricPt = i == c.cycleS ? p.diffMetricsU[p.cycleE > 0 ? p.cycleE - 1 : 0] : c.diffMetricsU[i - 1];
          metricNt = i == c.cycleE - 1 ? n.diffMetricsU[n.cycleS] : c.diffMetricsU[i + 1];
          // The first isolated dup becomes the candidate.  A later one replaces it when
          // its neighbours' metrics sum to more than the candidate's, both neighbours
          // exceed its own metric, and its normalized metric is within 1.0 of the one
          // at c.lowest[0].
          if (savedc == -1 || (metricPt + metricNt > metricP + metricN &&
            metricPt > c.diffMetricsU[i] && metricNt > c.diffMetricsU[i] &&
            fabs(c.diffMetricsN[i] - c.diffMetricsN[c.lowest[0]]) < 1.0))
          {
            savedc = i;
            metricP = metricPt;
            metricN = metricNt;
          }
        }
      }
    }
    // The candidate is only accepted when it is among the first v entries of c.lowest[].
    bool check = false;
    for (i = 0; i < v; ++i)
    {
      if (savedc == c.lowest[i]) { check = true; break; }
    }
    if (!check || savedc == -1)
    {
      c.setDecimateLow(cycleR);
      return;
    }
    c.decimate[savedc] = c.decimate2[savedc] = 1;
    c.decSet = true;
    return;
  }
  // Not reachable: every path through both branches above returns.
  c.setDecimateLow(cycleR);
}

// Decides which frames of cycle `c` to decimate, preferring frames that sit
// inside runs ("strings") of consecutive duplicates.
//
// p, c, n are the previous, current and next cycles. Duplicate flags are
// computed on demand via setDups(dupThresh). The result is written to
// c.decimate / c.decimate2 (1 = drop) and c.decSet; c.blend may be set to 3
// when two isolated duplicates are both marked.
//
// The value -20 is used throughout as the "unset / not applicable" sentinel.
void TDecimate::findDupStrings(Cycle &p, Cycle &c, Cycle &n)
{
  if (!p.dupsSet) p.setDups(dupThresh);
  if (!c.dupsSet) c.setDups(dupThresh);
  if (!n.dupsSet) n.setDups(dupThresh);
  // Upper bound on the number of separate duplicate runs recorded for a cycle.
  const int dcnt = (cycle + 1) >> 1;
  uint64_t lowest;
  int temp, i, g, b, f, forward, back, v, w = 0, j;
  int y, dups, ovrdups = 0, d2vdecf = -20;
  if (cycleR == 1) d2vdecf = checkForD2VDecFrame(p, c, n);
  for (temp = 0, i = c.cycleS; i < c.cycleE; ++i) temp += c.dupArray[i];
  if (temp == 0 && d2vdecf == -20)
  {
    // No duplicates flagged at all: fall back to dropping the lowest-metric frames.
    if (!c.lowSet) c.setLowest(false);
    c.setDecimateLow(c.frameEO - c.frameSO == cycle ? cycleR :
      std::max(int(cycleR*(c.frameEO - c.frameSO) / double(cycle)), 1));
    return;
  }
  // Keep decisions that are already 1 (counted in ovrdups), clear everything else.
  for (ovrdups = 0, i = c.cycleS; i < c.cycleE; ++i)
  {
    if (c.decimate[i] != 1) c.decimate[i] = c.decimate2[i] = 0;
    else ++ovrdups;
  }
  for (i = 0; i < c.cycleS; ++i) c.decimate[i] = c.decimate2[i] = -20;
  for (i = c.cycleE; i < c.length; ++i) c.decimate[i] = c.decimate2[i] = -20;
  // Number of frames to remove, scaled down for a cycle shorter than `cycle`.
  int cycleRt = c.frameEO - c.frameSO == cycle ? cycleR :
    std::max(int(cycleR*(c.frameEO - c.frameSO) / double(cycle)), 1);
  if (ovrdups >= cycleRt) { c.decSet = true; return; }
  if (cycleR == 1 && checkForObviousDecFrame(p, c, n)) return;
  if (cycleR == 1 && checkForTwoDropLongestString(p, c, n)) return;
  if (cycleR == 1 && cycle > 2 && ovrdups == 0 && c.dupCount > 1 &&
    (p.dupCount == 1 || p.dupCount == 0 || n.dupCount == 1 || n.dupCount == 0))
  {
    // c1 / c2: first isolated duplicate found scanning forward / backward
    // (isolated = both neighbours, possibly in p or n, are not duplicates).
    int p1 = -20, c1 = -20, c2 = -20, n1 = -20, dupcp = 0, usecp = 0;
    for (dupcp = 0, i = c.cycleS; i < c.cycleE && dupcp == 0; ++i)
    {
      if (c.dupArray[i] == 1 &&
        ((i > c.cycleS && c.dupArray[i - 1] == 0) || (i == c.cycleS && p.dupArray[p.cycleE > 0 ? p.cycleE - 1 : 0] == 0)) &&
        ((i < c.cycleE - 1 && c.dupArray[i + 1] == 0) || ((i == c.cycleE - 1 && n.dupArray[n.cycleS] == 0))))
      {
        c1 = i; break;
      }
      if (c.dupArray[i] == 1) ++dupcp;
    }
    for (dupcp = 0, i = c.cycleE - 1; i >= c.cycleS && dupcp == 0; --i)
    {
      if (c.dupArray[i] == 1 &&
        ((i > c.cycleS && c.dupArray[i - 1] == 0) || (i == c.cycleS && p.dupArray[p.cycleE > 0 ? p.cycleE - 1 : 0] == 0)) &&
        ((i < c.cycleE - 1 && c.dupArray[i + 1] == 0) || ((i == c.cycleE - 1 && n.dupArray[n.cycleS] == 0))))
      {
        c2 = i; break;
      }
      if (c.dupArray[i] == 1) ++dupcp;
    }
    bool ct1 = false, ct2 = false;
    if ((p.dupCount == 1 || p.dupCount == 0) && c1 != -20)
    {
      if (p.dupCount == 0 && (p.cycleE - p.cycleS == p.length ||
        n.dupCount != c.dupCount)) {
        p1 = c1; ct1 = true;
      }
      else
      {
        for (i = p.cycleS; i < p.cycleE; ++i)
        {
          if (p.dupArray[i] == 1) { p1 = i; break; }
        }
      }
      if (p1 == c1) usecp += 1;
    }
    if ((n.dupCount == 1 || n.dupCount == 0) && c2 != -20)
    {
      if (n.dupCount == 0 && (n.cycleE - n.cycleS == n.length ||
        p.dupCount != c.dupCount)) {
        n1 = c2; ct2 = true;
      }
      else
      {
        for (i = n.cycleS; i < n.cycleE; ++i)
        {
          if (n.dupArray[i] == 1) { n1 = i; break; }
        }
      }
      if (n1 == c2) usecp += 5;
    }
    // usecp: 1 = agree with previous cycle (use c1), 5 = agree with next cycle
    // (use c2), 6 = both. When blending is not available, 6 is reduced to 1 or 5.
    if (hybrid == 0 && noblend && usecp == 6)
    {
      if (ct1 && !ct2) usecp = 5;
      else if (!ct1 && ct2) usecp = 1;
      else
      {
        if (c.diffMetricsU[c1] <= c.diffMetricsU[c2]) usecp = 1;
        else usecp = 5;
      }
    }
    if (usecp == 1 || usecp == 5)
    {
      if (usecp == 5) c1 = c2;
      c.decimate[c1] = c.decimate2[c1] = 1;
//      if (debug)
//      {
//        sprintf(buf, "TDecimate:  usecp case %d - %d!\n", usecp, c.frameSO);
//        OutputDebugString(buf);
//      }
      c.decSet = true;
      return;
    }
    else if (usecp == 6)
    {
      c.blend = 3;
      c.decimate[c1] = c.decimate2[c1] = 1;
      c.decimate[c2] = c.decimate2[c2] = 1;
//      if (debug)
//      {
//        sprintf(buf, "TDecimate:  usecp case %d - %d!\n", usecp, c.frameSO);
//        OutputDebugString(buf);
//      }
      c.decSet = true;
      return;
    }
  }
  if (d2vdecf != -20)
  {
    c.decimate[d2vdecf] = c.decimate2[d2vdecf] = 1;
    c.decSet = true;
    return;
  }

  // One record per run of consecutive duplicates that touches cycle c.
  //   len   : run length (may include frames in p / n), minus one
  //   first : first index of the run inside c
  //   last  : one past the last index of the run inside c
  // Held in a std::vector so the storage is released on every exit path.
  struct DupString { int len; int first; int last; };
  std::vector<DupString> dupStrings(dcnt, DupString{ -20, -20, -20 });

  for (w = 0, i = c.cycleS; i < c.cycleE; ++i)
  {
    if (c.dupArray[i] == 0) continue;
    f = b = i;
    forward = back = 0;
    // The index is tested before the array is read so that no element outside
    // [cycleS, cycleE) is ever touched.
    while (f < c.cycleE && c.dupArray[f] == 1)
    {
      ++forward;
      ++f;
    }
    if (f == c.cycleE)
    {
      // The run reaches the end of c: continue counting into the next cycle.
      g = n.cycleS;
      while (g < n.cycleE && n.dupArray[g] == 1)
      {
        ++g;
        ++forward;
      }
    }
    while (b >= c.cycleS && c.dupArray[b] == 1)
    {
      ++back;
      --b;
    }
    if (b < 0)
    {
      // The run reaches index 0 of c: continue counting back into the previous
      // cycle, skipping frames that p already marked in decimate2.
      g = p.cycleE - 1;
      while (g >= p.cycleS && p.dupArray[g] == 1 && p.decimate2[g] != 1)
      {
        ++back;
        --g;
      }
    }
    i = f;
    ++b;
    dupStrings[w].len = back + forward - 1;
    dupStrings[w].first = b;
    dupStrings[w].last = f;
    ++w;
  }
  if (ovrArray.size())
  {
    // Frames already marked for removal shorten the run they belong to.
    for (i = c.cycleS; i < c.cycleE; ++i)
    {
      if (c.decimate[i] == 1)
      {
        for (v = 0; v < w; ++v)
        {
          if (i >= dupStrings[v].first && i < dupStrings[v].last)
          {
            if (dupStrings[v].last - dupStrings[v].first - 1 <= 0) dupStrings[v].len = -20;
            else --dupStrings[v].len;
          }
        }
      }
    }
  }
  // Insertion sort: longest run first, ties broken by the smaller start index.
  for (i = 1; i < w; ++i)
  {
    const DupString key = dupStrings[i];
    j = i;
    while (j > 0 && (dupStrings[j - 1].len < key.len ||
      (dupStrings[j - 1].len == key.len && dupStrings[j - 1].first > key.first)))
    {
      dupStrings[j] = dupStrings[j - 1];
      --j;
    }
    dupStrings[j] = key;
  }
  // Repeatedly take one frame out of the currently longest run.
  for (v = 0; v < c.dupCount && v < cycleRt - ovrdups; ++v)
  {
    if (dupStrings[0].len < 1) break;
    lowest = UINT64_MAX;
    f = dupStrings[0].first;
    for (dups = 0, i = dupStrings[0].first; i < dupStrings[0].last; ++i)
    {
      if (c.diffMetricsU[i] < lowest && c.decimate[i] == 0)
      {
        lowest = c.diffMetricsU[i];
        f = i;
      }
      if (c.decimate[i] == 1) ++dups;
    }
    // decimate gets the lowest-metric frame of the run; decimate2 gets the
    // first frame of the run that is not yet marked.
    c.decimate[f] = 1;
    y = dupStrings[0].first;
    while (c.decimate2[y] == 1) ++y;
    c.decimate2[y] = 1;
    if (dupStrings[0].last - dupStrings[0].first - dups - 1 <= 0) dupStrings[0].len = -20;
    else --dupStrings[0].len;
    // Sink the updated head entry back to its sorted position.
    const DupString head = dupStrings[0];
    j = 0;
    while (j < w - 1 && (dupStrings[j + 1].len > head.len ||
      (dupStrings[j + 1].len == head.len && dupStrings[j + 1].first < head.first)))
    {
      dupStrings[j] = dupStrings[j + 1];
      ++j;
    }
    dupStrings[j] = head;
  }
  c.decSet = true;
  if (v < cycleRt - ovrdups)
  {
    // Not enough run-based drops were found: fill the remainder by metric.
    c.setLowest(true);
    c.setDecimateLowP(cycleRt - ovrdups - v);
  }
}

// Classifies cycle `c` from its field-match sequence.
//
// Walks the frames of c and asks checkMatchDup() whether any (previous match,
// current match) pair indicates a duplicate. For the first frame of c the
// previous match is taken from the last frame of p, or -20 when p and c are
// the same cycle position or p is empty.
//
// No duplicate pair found: type -1/0 becomes 2 and type 3 becomes 4.
// Duplicate pair found:    type -1 becomes 0.
// Nothing is done unless both cycles have matches set and c.type is <= 0 or 3.
void TDecimate::checkVideoMatches(Cycle &p, Cycle &c)
{
  if (!p.mSet || !c.mSet || (c.type != 3 && c.type > 0)) return;
  bool dupFound = false;
  for (int i = c.cycleS; i < c.cycleE && !dupFound; ++i)
  {
    int mp;
    if (i != c.cycleS) mp = c.match[i - 1];
    else if (p.frame != c.frame && p.cycleE > 0) mp = p.match[p.cycleE - 1];
    else mp = -20;
    // Shared transition table, so this function and checkMatchDup cannot diverge.
    dupFound = checkMatchDup(mp, c.match[i]);
  }
  if (!dupFound)
  {
    if (c.type == -1 || c.type == 0) c.type = 2;
    else if (c.type == 3) c.type = 4;
  }
  else if (c.type == -1) c.type = 0;
}

// Returns true when the pair (previous match mp, current match mc) is treated
// as a duplicate transition. Any negative mc counts as a duplicate regardless
// of mp; otherwise the answer comes from the per-mp table below.
bool TDecimate::checkMatchDup(int mp, int mc)
{
  if (mc < 0) return true;
  switch (mp)
  {
  case 0:
  case 5:
    return mc == 3;
  case 1:
    return mc == 0 || mc == 3;
  case 2:
    return mc == 1 || mc == 3 || mc == 4 || mc == 6;
  case 3:
  case 6:
    return mc == 0;
  case 4:
    return mc == 0 || mc == 1 || mc == 2 || mc == 5;
  default:
    return false;
  }
}

// Classifies cycle `c` from its normalized difference metrics.
//
// Counts frames whose diffMetricsN is <= thresh and tracks the smallest and
// largest metric in the cycle (the first frame is skipped when c.frame == 0).
// If no frame is below the threshold, or `cve` is set and the metrics are
// close together (ratio < 1.6, spread < 2.0, maximum >= 0.3), the type is
// promoted: -1/0 -> 3, 2 -> 4. Otherwise an undecided type (-1) becomes 0.
// Nothing is done unless c.mSet is true and c.type is <= 0 or 2.
void TDecimate::checkVideoMetrics(Cycle &c, double thresh)
{
  if (!c.mSet || (c.type > 0 && c.type != 2)) return;

  const int first = (c.frame == 0) ? c.cycleS + 1 : c.cycleS;
  int belowThresh = 0;
  double lo = 999999.0, hi = -999999.0;
  for (int idx = first; idx < c.cycleE; ++idx)
  {
    const double metric = c.diffMetricsN[idx];
    if (metric <= thresh) ++belowThresh;
    if (metric < lo) lo = metric;
    if (metric > hi) hi = metric;
  }
  // Avoid dividing by zero in the ratio test.
  if (lo == 0.0) lo = 0.0001;

  if (belowThresh == 0 || (cve && hi / lo < 1.6 && hi - lo < 2.0 && hi >= 0.3))
  {
    switch (c.type)
    {
    case -1:
    case 0:
      c.type = 3;
      break;
    case 2:
      c.type = 4;
      break;
    default:
      break;
    }
  }
  else if (c.type == -1)
  {
    c.type = 0;
  }
}

// PF 180131 uses usehints!
// Fills `current` from the override array and the stored metric arrays.
//
// For each frame in [frameSO, frameEO):
//   - DROP_FRAME in ovrArray marks the frame as a duplicate (mode2) or as
//     decimated (otherwise, limited to the number of frames to remove, and
//     setting current.type = 1);
//   - VIDEO / FILM flags are counted;
//   - with usehints, the ISD2VFILM flag and the match code stored in the
//     ISMATCH bits are copied into the cycle;
//   - metrics are taken from metricsArray, falling back to metricsOutArray
//     when metricsArray had no unfiltered metric for the frame.
// Afterwards the cycle type becomes 5 if every frame was flagged VIDEO
// (and type != 1), or 1 if every frame was flagged FILM (and type != 5).
void TDecimate::getOvrCycle(Cycle &current, bool mode2)
{
  if (mode2) current.dupCount = 0;
  if (ovrArray.empty() && metricsArray.empty() && metricsOutArray.empty()) return;

  const int span = current.frameEO - current.frameSO;
  const int maxDrops = span == cycle ? cycleR :
    std::max(int(cycleR*span / double(cycle)), 1);
  int videoCount = 0, filmCount = 0, dropCount = 0;
  int slot = current.cycleS;

  for (int frameNo = current.frameSO; frameNo < current.frameEO; ++frameNo, ++slot)
  {
    if (ovrArray.size())
    {
      const int flags = ovrArray[frameNo];
      if ((flags & DROP_FRAME) && (dropCount < maxDrops || mode2))
      {
        if (mode2)
        {
          current.dupArray[slot] = 1;
          ++current.dupCount;
        }
        else
        {
          current.decimate[slot] = current.decimate2[slot] = 1;
          ++dropCount;
          current.type = 1;
        }
      }
      if (flags & VIDEO) ++videoCount;
      if (flags & FILM) ++filmCount;
      if (usehints)
      {
        if (flags & ISD2VFILM) current.filmd2v[slot] = 1;
        // 0x70 in the match bits means no match information is stored.
        if ((flags & ISMATCH) != 0x70)
        {
          const int matchCode = (flags & ISMATCH) >> 4;
          if (matchCode == ISC) current.match[slot] = ISC;
          else if (matchCode == ISP) current.match[slot] = ISP;
          else if (matchCode == ISN) current.match[slot] = ISN;
          else if (matchCode == ISB) current.match[slot] = ISB;
          else if (matchCode == ISU) current.match[slot] = ISU;
          else if (matchCode == ISDB) current.match[slot] = ISDB;
          else if (matchCode == ISDT) current.match[slot] = ISDT;
        }
      }
    }

    // Two entries are stored per frame; UINT64_MAX means "no value".
    const int idxU = frameNo << 1;
    const int idxUF = idxU + 1;
    bool foundM = false;
    if (metricsArray.size())
    {
      if (metricsArray[idxU] != UINT64_MAX)
      {
        current.diffMetricsU[slot] = metricsArray[idxU];
        current.diffMetricsN[slot] = (metricsArray[idxU] * 100.0) / MAX_DIFF;
        foundM = true;
      }
      if (metricsArray[idxUF] != UINT64_MAX)
        current.diffMetricsUF[slot] = metricsArray[idxUF];
    }
    if (metricsOutArray.size() && !foundM)
    {
      if (metricsOutArray[idxU] != UINT64_MAX)
      {
        current.diffMetricsU[slot] = metricsOutArray[idxU];
        current.diffMetricsN[slot] = (metricsOutArray[idxU] * 100.0) / MAX_DIFF;
      }
      if (metricsOutArray[idxUF] != UINT64_MAX)
        current.diffMetricsUF[slot] = metricsOutArray[idxUF];
    }
  }

  const int cycleLen = current.cycleE - current.cycleS;
  if (videoCount > 0 && videoCount == cycleLen && current.type != 1)
    current.type = 5;
  if (filmCount > 0 && filmCount == cycleLen && current.type != 5)
    current.type = 1;
  current.setIsFilmD2V();
}

// Looks up the two stored metrics of frame n.
//
// Both outputs start as UINT64_MAX ("no value"). metricsArray is consulted
// first; metricsOutArray only fills whichever output is still UINT64_MAX.
void TDecimate::getOvrFrame(int n, uint64_t &metricU, uint64_t &metricF) const
{
  const int idxU = n << 1;
  const int idxF = idxU + 1;

  metricU = metricF = UINT64_MAX;

  if (metricsArray.size())
  {
    if (metricsArray[idxU] != UINT64_MAX) metricU = metricsArray[idxU];
    if (metricsArray[idxF] != UINT64_MAX) metricF = metricsArray[idxF];
  }

  // Both values found: nothing left to look up.
  const bool complete = (metricU != UINT64_MAX && metricF != UINT64_MAX);
  if (complete || !metricsOutArray.size())
    return;

  if (metricU == UINT64_MAX && metricsOutArray[idxU] != UINT64_MAX)
    metricU = metricsOutArray[idxU];
  if (metricF == UINT64_MAX && metricsOutArray[idxF] != UINT64_MAX)
    metricF = metricsOutArray[idxF];
}

// Computes which two source frames to blend for output frame n, and with
// which weights, when cycleI input frames are resampled to cycleI - cycleR
// evenly spaced output positions starting at frame bframe.
//
// frame1 / frame2 receive the integer positions below and above the sample
// point; amount1 / amount2 receive their weights (amount1 + amount2 == 1.0).
void TDecimate::calcBlendRatios(double &amount1, double &amount2, int &frame1, int &frame2, int n,
  int bframe, int cycleI)
{
  const int kept = cycleI - cycleR;
  const double stepsize = static_cast<double>(cycleI) / static_cast<double>(kept);
  // Centres the output sample points within the input cycle.
  const double offset = ((cycleI - 1) - (stepsize*(kept - 1)))*0.5;
  const double pos = bframe + (n % (cycle - cycleR))*stepsize + offset;
  const double posf = pos - static_cast<int>(pos);

  frame1 = static_cast<int>(pos);
  frame2 = static_cast<int>(pos + 1.0);
  amount1 = 1.0 - posf;
  amount2 = posf;
}

// Computes the two source frames and blend weights for output frame tf when
// `remove` frames are dropped from cycle c.
//
// A lookup table of surviving frame numbers is built:
//   lutf[0]      last non-decimated frame of the previous cycle p
//   lutf[1..k-1] non-decimated frames of the current cycle c
//   lutf[k]      one frame numbered from n.frameSO (see note below)
// Entries that were never filled (-20) or fall outside [0, nfrms] are clamped
// into that range. frame1 / frame2 are then read from the table at the
// integer positions around the sample point; amount1 + amount2 == 1.0.
void TDecimate::calcBlendRatios2(double &amount1, double &amount2, int &frame1, int &frame2, int tf,
  Cycle &p, Cycle &c, Cycle &n, int remove)
{
  int i, b, k;
  int cycleI = c.cycleE - c.cycleS;
  int cycleD = cycleI - remove;
  // std::vector instead of malloc/free: the allocation cannot silently fail
  // with a null pointer and is released on every exit path.
  std::vector<int> lutf(cycleI + 2, -20);
  double stepsize = ((double)cycleD) / ((double)(cycleI));
  double offset = (cycleI - 1)*stepsize;
  offset = (offset - int(offset))*0.5;
  double pos = 1 + (tf%p.length)*stepsize - offset;
  double posf = pos - (int)(pos);
  for (b = p.frameEO - 1, i = p.cycleE - 1; i >= p.cycleS; --i, --b)
  {
    if (p.decimate[i] != 1)
    {
      lutf[0] = b;
      break;
    }
  }
  for (k = 1, b = c.frameSO, i = c.cycleS; i < c.cycleE; ++i, ++b)
  {
    if (c.decimate[i] != 1)
    {
      lutf[k] = b;
      ++k;
    }
  }
  // NOTE(review): this loop numbers frames from n.frameSO but walks c's index
  // range and tests c.decimate, not n's. Kept exactly as in the original;
  // confirm whether using c here is intentional.
  for (b = n.frameSO, i = c.cycleS; i < c.cycleE; ++i, ++b)
  {
    if (c.decimate[i] != 1)
    {
      lutf[k] = b;
      break;
    }
  }
  for (i = 0; i < cycleI + 2; ++i)
  {
    if (lutf[i] < 0) lutf[i] = 0;
    else if (lutf[i] > nfrms) lutf[i] = nfrms;
  }
  frame1 = lutf[(int)(pos)];
  frame2 = lutf[(int)(pos + 1.0)];
  amount1 = 1.0 - posf;
  amount2 = posf;
  // amount 1 and 2 sum is always 1.0, some routines know this and use only amount1
}

// used in GetFrameMode01
// hbd ready
//
// Writes into dst the weighted blend of src1 and src2, where amount1 is the
// weight of src1. The weight is converted to a fixed-point integer with a
// scale of 32768; a weight that rounds to the full scale or to zero turns the
// call into a plain copy of src1 or src2 respectively.
void TDecimate::blendFrames(const VSFrameRef *src1, const VSFrameRef *src2, VSFrameRef *dst,
  double amount1)
{
  const float weight_f = (float)amount1;

  // 15 bit arithmetic (used as 16 bit in the 8 bit case)
  const int weight_i = (int)(weight_f * 32768.0f + 0.5f);

  if (weight_i >= 32768)
  {
    // 100% src1
    copyFrame(dst, src1, vsapi);
    return;
  }
  if (weight_i <= 0)
  {
    // 100% src2
    copyFrame(dst, src2, vsapi);
    return;
  }

  const VSFormat *format = vsapi->getFrameFormat(dst);
  const int planeCount = format->numPlanes;
  const int bits_per_pixel = format->bitsPerSample;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    // Plane dimensions are taken from src1.
    const int width = vsapi->getFrameWidth(src1, plane);
    const int height = vsapi->getFrameHeight(src1, plane);

    const uint8_t *srcp1 = vsapi->getReadPtr(src1, plane);
    const int s1_pitch = vsapi->getStride(src1, plane);
    const uint8_t *srcp2 = vsapi->getReadPtr(src2, plane);
    const int s2_pitch = vsapi->getStride(src2, plane);
    uint8_t *dstp = vsapi->getWritePtr(dst, plane);
    const int dst_pitch = vsapi->getStride(dst, plane);

    dispatch_blend(dstp, srcp1, srcp2, width, height, dst_pitch, s1_pitch, s2_pitch, weight_i, bits_per_pixel, &cpuFlags);
  }
}


/*
** I've copied the following functions:  float_to_frac, reduce_float,
** and FloatToFPS from fps.cpp, which can be obtained from avisynth's
** cvs (avisynth2.cvs.sourceforge.net).
*/

// Converts a float to the exact fraction num/den that it represents.
//
// The IEEE-754 single-precision bit pattern is split into mantissa and
// exponent, trailing zero bits of the mantissa are removed, and the result is
// expressed as mantissa * 2^exponent. The sign bit is ignored.
//
// Returns false on success. Returns true ("out of range") when the value does
// not fit in 32 bits; num/den are then set to 0xffffffff/1. Values too small
// for a 32-bit denominator are handled by converting the reciprocal with num
// and den swapped.
static bool float_to_frac(float input, unsigned &num, unsigned &den)
{
  static_assert(sizeof(float) == sizeof(unsigned), "float_to_frac expects a 32-bit float");
  // memcpy is the well-defined way to read the object representation;
  // reading the inactive member of a union is undefined behaviour in C++.
  unsigned bits;
  memcpy(&bits, &input, sizeof(bits));

  unsigned mantissa = (bits & 0x7FFFFF) + 0x800000;  // add implicit bit on the left
  int exponent = (int)((bits & 0x7F800000) >> 23) - 127 - 23;  // remove decimal pt
  // Bit 23 is always set, so this loop terminates.
  while (!(mantissa & 1)) { mantissa >>= 1; exponent += 1; }
  if (exponent < -31) {
    return float_to_frac(float(1.0 / input), den, num);
  }
  while ((exponent > 0) && !(mantissa & 0x80000000)) {
    mantissa <<= 1; exponent -= 1;
  }
  if (exponent > 0) {  // number too large
    num = 0xffffffff;
    den = 1;
    return true; // Out of range!
  }
  num = mantissa;
  // Unsigned shift: exponent may be -31, and 1 << 31 does not fit in a signed int.
  den = 1u << (-exponent);
  return false;
}

// Finds a fraction num/den with small terms whose float value equals `value`.
//
// The exact fraction from float_to_frac() is expanded into continued-fraction
// convergents until one converts back to exactly `value` (or the expansion
// ends). Unless the denominator is 1, the last step is then adjusted using the
// midpoint between `value` and its neighbouring float.
//
// Returns true when float_to_frac() reports the value as out of range (num/den
// are then whatever float_to_frac() left in them), false otherwise.
static bool reduce_float(float value, unsigned &num, unsigned &den)
{
  if (float_to_frac(value, num, den)) return true;
  unsigned n0 = 0, n1 = 1, n2, nx = num;  // numerators
  unsigned d0 = 1, d1 = 0, d2, dx = den;  // denominators
  unsigned a2, ax, amin;  // integer parts of quotients
  unsigned f1 = 0, f2;    // fractional parts of quotients
  while (1)  // calculate convergents
  {
    a2 = nx / dx;
    f2 = nx % dx;
    n2 = n0 + n1 * a2;
    d2 = d0 + d1 * a2;
    if (f2 == 0) break;  // no more convergents (n2 / d2 == input)
    // Stop as soon as a convergent rounds to the requested float.
    float f = (float)((double)n2 / d2);
    if (f == value) break;
    n0 = n1; n1 = n2;
    d0 = d1; d1 = d2;
    nx = dx; dx = f1 = f2;
  }
  if (d2 == 1)
  {
    // Integer result: nothing to refine.
    num = n2;
    den = d2;
  }
  else { // we have been through the loop at least twice
    if ((a2 % 2 == 0) && (d0 * f1 > f2 * d1))
      amin = a2 / 2;  // passed 1/2 a_k admissibility test
    else
      amin = a2 / 2 + 1;
    // eps starts as the bit pattern of value and is moved one step to the
    // adjacent float on the side chosen by the comparison below.
    union { float f; unsigned i; } eps; eps.f = value;

    // 32x32 -> 64 bit multiply; only defined here if the platform headers
    // have not already provided it.
#ifndef UInt32x32To64
#define UInt32x32To64(a, b) ((uint64_t)(((uint64_t)((uint32_t)(a))) * ((uint32_t)(b))))
#endif
    if (UInt32x32To64(n1, den) > UInt32x32To64(num, d1))
      eps.i -= 1;
    else
      eps.i += 1;

    // r2 is the midpoint between value and the adjacent float.
    double r2 = eps.f;
    r2 += value;
    r2 /= 2;
    double yn = n0 - r2*d0;
    double yd = r2*d1 - n1;
    ax = (unsigned)((yn + yd) / yd); // ceiling value
    if (ax < amin) ax = amin;
    num = n0 + n1 * ax;
    den = d0 + d1 * ax;
  }
  return false;
}

// Converts a frame rate given as a double into a num/den pair.
//
// Rates that match a multiple of 30000/1001 or 24000/1001 (and, for rates
// below 14.986, 30000/den or 24000/den with den a multiple of 1001) are
// returned in that form; everything else goes through reduce_float().
//
// Returns false on success and true when reduce_float() reports the rate as
// out of range. The caller is expected to have rejected n <= 0 already and to
// raise the error when true is returned.
static bool FloatToFPS(double n, unsigned &num, unsigned &den)
{
  const unsigned bases[2] = { 30000u, 24000u };
  const float target = (float)n;

  // Try n == k * base / 1001.
  unsigned u = (unsigned)(n * 1001 + 0.5);
  for (int k = 0; k < 2; ++k)
  {
    const float x = float((u / bases[k] * bases[k]) / 1001.0);
    if (x == target)
    {
      num = u;
      den = 1001;
      return false;
    }
  }

  // Try n == base / (k * 1001) for low rates.
  if (n < 14.986)
  {
    for (int k = 0; k < 2; ++k)
    {
      u = (unsigned)(bases[k] / n + 0.5);
      const float x = float((double)bases[k] / (u / 1001 * 1001));
      if (x == target)
      {
        num = bases[k];
        den = u;
        return false;
      }
    }
  }

  // General case; a true result means the rate value is out of range.
  return reduce_float(float(n), num, den);
}



// Pre-computes everything mode 5 needs before frames are served.
//
// 1. Pass 1 walks the clip cycle by cycle and records per input frame, in
//    input_magic_numbers, either 2 (frame chosen for decimation) or 8 (set on
//    the first frame of a cycle classified as video when its type is 5 or a
//    scene change is detected).
// 2. Between the passes, runs of fewer than conCycleTP consecutive cycles
//    without a 2 mark get the first frame of each cycle marked 2, unless it
//    is marked 8.
// 3. Pass 2 re-runs the decimation decision for every cycle that contains a
//    2 mark, rewrites the marks (2 = drop, 0 = keep) and counts the drops.
// 4. The timecode file mkvOut is written (v1 or v2 format), per-frame
//    durations are stored in frame_duration_info, the output-to-input frame
//    table aLUT is built, and the optional orgOut file is written.
//
// Throws TIVTCError if a file cannot be created or an unexpected number of
// drops is found in a cycle while writing v2 timecodes.
void TDecimate::init_mode_5(VSCore *core) {
  FILE *f = nullptr;

  mkvfps = (fps*(cycle - cycleR)) / cycle;
  mkvfps2 = (fps*(cycle - cycleR - 1)) / cycle;
  std::vector<int> input_magic_numbers(vi.numFrames, 0);

  Cycle prevM(5, sdlim), currM(5, sdlim), nextM(5, sdlim);
  if (cycle > 5)
  {
    prevM.setSize(cycle);
    currM.setSize(cycle);
    nextM.setSize(cycle);
  }
  prevM.length = currM.length = nextM.length = cycle;
  prevM.maxFrame = currM.maxFrame = nextM.maxFrame = nfrms;
  bool vid, prevVid;
  int i, h, w, firstkv, countprev, filmC, videoC, longestT, longestV, countVT;
  int count, b, passThrough = 0;
twopassrun:
  ++passThrough;
#if 0
  if ((f = tivtc_fopen("debug.txt", "a")) != nullptr) {
    fprintf(f, "passThrough=%d cycle=%d nfrms=%d vidThresh=%f np=%d\n", passThrough, cycle, nfrms, (float)vidThresh, np);
    fclose(f);
    f = nullptr;
  }
#endif
  count = 0;
  for (b = 0; b <= nfrms; b += cycle)
  {
    if (b == 0)
    {
      currM.setFrame(0);
      getOvrCycle(currM, false); // PF 180131 uses usehints!
      calcMetricCycle(currM, true, true, core);
      checkVideoMatches(currM, currM);
      checkVideoMetrics(currM, vidThresh);
    }
    else
    {
      // Slide the three-cycle window forward by one cycle.
      prevM = currM;
      currM = nextM;
    }
    nextM.setFrame(b + cycle);
    getOvrCycle(nextM, false); // PF 180131 uses usehints!
    calcMetricCycle(nextM, true, true, core); // PF 180131 uses usehints!
    checkVideoMatches(currM, nextM);
    checkVideoMetrics(nextM, vidThresh);
    if (passThrough == 1)
    {
      if (currM.type == 5 || (!currM.isfilmd2v && ((currM.type == 2 && (vidDetect == 0 || vidDetect == 2)) ||
        (currM.type == 3 && (vidDetect == 1 || vidDetect == 2)) || (currM.type == 4 && vidDetect == 3))))
      {
        // Video cycle: nothing is dropped; only the cycle start may be tagged 8.
        if (currM.type == 5) input_magic_numbers[b] = 8;
        if (currM.sceneDetect(prevM, nextM, sceneThreshU) != -20) input_magic_numbers[b] = 8;
      }
      else
      {
        if (vfrDec != 1)
        {
          mostSimilarDecDecision(prevM, currM, nextM);
        }
        else
        {
          prevM.setDups(dupThresh);
          currM.setDups(dupThresh);
          nextM.setDups(dupThresh);
          findDupStrings(prevM, currM, nextM);
        }
        for (w = 0, i = b; i < b + cycle && i <= nfrms; ++i, ++w)
        {
          if (currM.decimate[w] == 1) input_magic_numbers[i] = 2;
        }
      }
    } // passthrough == 1
    else
    { // passthrough != 1
      for (vid = true, i = b; i <= nfrms && i < b + cycle; ++i)
      {
        if (input_magic_numbers[i] == 2) vid = false;
      }
      if (!vid)
      {
        if (vfrDec != 1)
        {
          mostSimilarDecDecision(prevM, currM, nextM);
        }
        else
        {
          prevM.setDups(dupThresh);
          currM.setDups(dupThresh);
          nextM.setDups(dupThresh);
          findDupStrings(prevM, currM, nextM);
        }
        for (w = 0, i = b; i < b + cycle && i <= nfrms; ++i, ++w)
        {
          if (currM.decimate[w] == 1)
          {
            input_magic_numbers[i] = 2;
            ++count;
#if 0
            if ((f = tivtc_fopen("debug.txt", "a")) != nullptr) {
              fprintf(f, "count=%03d b=%d w=%d i=%d \n", count, b, w, i);
              fclose(f);
              f = nullptr;
            }
#endif
          }
          else input_magic_numbers[i] = 0;
        }
      }
      else
      {
        for (i = b; i < b + cycle && i <= nfrms; ++i) input_magic_numbers[i] = 0;
      }
    } // passthrough != 1
  }
  if (passThrough == 2) { goto finishTP; }
  // Between the passes: w counts consecutive cycles without a 2 mark. When
  // such a run is shorter than conCycleTP, mark the start of each of its
  // cycles with 2 (unless tagged 8) so that pass 2 processes them.
  for (w = 0, h = 0; h <= nfrms; h += cycle)
  {
    for (vid = true, i = h; i < h + cycle && i <= nfrms; ++i)
    {
      if (input_magic_numbers[i] == 2) vid = false;
    }
    if (vid) ++w;
    else
    {
      if (w > 0 && w < conCycleTP)
      {
        for (i = std::max(0, h - w * cycle); i < h && i <= nfrms; i += cycle)
        {
          if (input_magic_numbers[i] != 8) input_magic_numbers[i] = 2;
        }
      }
      w = 0;
    }
  }
  // Same treatment for a short run that reaches the end of the clip.
  if (w > 0 && w < conCycleTP)
  {
    for (i = h - w * cycle; i < h && i <= nfrms; i += cycle)
    {
      if (input_magic_numbers[i] != 8) input_magic_numbers[i] = 2;
    }
  }
  goto twopassrun;
finishTP:
  metricsArray.resize(0);

  if (ovrArray.size())
  {
    ovrArray.resize(0);
  }

#if 0
  if ((f = tivtc_fopen("debug.txt", "a")) != nullptr) {
    fprintf(f, "new_num_frames=%d vi.numFrames=%d count=%d\n", vi.numFrames - count, vi.numFrames, count);
    fclose(f);
    f = nullptr;
  }
#endif

  // The output clip gets fpsNum = fpsDen = 0 and loses the frames counted in pass 2.
  int fpsNum = vi.fpsNum;
  int frameNum = vi.fpsDen;
  vi.fpsNum = 0;
  vi.fpsDen = 0;
  vi.numFrames = vi.numFrames - count;
  if ((f = tivtc_fopen(mkvOut.c_str(), "w")) != nullptr)
  {
    double timestamp = 0.0;
    double sample1 = 1000.0 / fps;
    double sample2 = 1000.0 / mkvfps;
    double sample3 = 1000.0 / mkvfps2;
    int ddup;
    if (tcfv1)
    {
      fprintf(f, "# timecode format v1\n");
      fprintf(f, "Assume %4.6f\n", fps);
    }
    else fprintf(f, "# timecode format v2\n");
    fprintf(f, "# TDecimate %s by tritical\n", VERSION);
    fprintf(f, "# Mode 5 - Auto-generated mkv timecodes file\n");
    firstkv = countprev = 0;
    vid = prevVid = true;
    filmC = videoC = longestT = longestV = countVT = 0;
    for (count = 0, b = 0; b <= nfrms; b += cycle)
    {
      prevVid = vid;
      countprev = count;
      vid = true;
      // ddup = number of dropped frames in this cycle; count = kept frames so far.
      for (i = b, ddup = 0; i < b + cycle && i <= nfrms; ++i)
      {
        if (input_magic_numbers[i] == 2)
        {
          ++ddup;
          if (ddup < 2) filmC += (b + cycle <= nfrms ? cycle : nfrms - b + 1);
          vid = false;
        }
        else ++count;
      }
      
      // Per-frame duration for this cycle: stored as (vi.fpsDen, scaled fpsNum).
      int frameDen;
      switch (ddup)
      {
          case 1:
          frameDen = static_cast<int>(fpsNum * (cycle - cycleR) / cycle);
          break;
          case 2:
          frameDen = static_cast<int>(fpsNum * (cycle - cycleR - 1) / cycle);
          break;
          default:
          frameDen = fpsNum;
          break;
      }
      // NOTE(review): unlike the other per-cycle loops this one is not bounded
      // by nfrms, so the last partial cycle writes entries past the final
      // frame. Left unchanged because the container type of
      // frame_duration_info is not visible here - confirm it tolerates this.
      for (int frm = b; frm < b + cycle; frm++)
      {
        frame_duration_info[frm].first = frameNum;
        frame_duration_info[frm].second = frameDen;
      }

      if (vid)
      {
        if (!tcfv1)
        {
          int stop = (b + cycle <= nfrms ? cycle : nfrms - b + 1);
          for (int x = 0; x < stop; ++x)
          {
            fprintf(f, "%3.6f\n", timestamp);
            timestamp += sample1;
          }
        }
        videoC += (b + cycle <= nfrms ? cycle : nfrms - b + 1);
        longestT += (b + cycle <= nfrms ? cycle : nfrms - b + 1);
      }
      else if (!tcfv1)
      {
        if (ddup == 1)
        {
          int stop = (b + cycle <= nfrms ? cycle - cycleR : nfrms - b + 1 - cycleR);
          for (int x = 0; x < stop; ++x)
          {
            fprintf(f, "%3.6f\n", timestamp);
            timestamp += sample2;
          }
        }
        else if (ddup == 2)
        {
          int stop = (b + cycle <= nfrms ? cycle - cycleR - 1 : nfrms - b + 1 - cycleR - 1);
          for (int x = 0; x < stop; ++x)
          {
            fprintf(f, "%3.6f\n", timestamp);
            timestamp += sample3;
          }
        }
        else
        {
          // Close the timecode file before bailing out so the handle is not leaked.
          fclose(f);
          f = nullptr;
          throw TIVTCError("TDecimate:  unknown mode 5 error (tc file creation)!");
        }
      }
      else if (ddup == 2)
      {
        if (!prevVid) fprintf(f, "%d,%d,%4.6f\n", firstkv, countprev - 1, mkvfps);
        fprintf(f, "%d,%d,%4.6f\n", countprev, countprev + cycle - cycleR - 2, mkvfps2);
        firstkv = countprev + cycle - cycleR - 1;
      }
      if (prevVid != vid && countprev != 0 && ddup != 2 && countprev > firstkv)
      {
        if (!prevVid && tcfv1) fprintf(f, "%d,%d,%4.6f\n", firstkv, countprev - 1, mkvfps);
        firstkv = countprev;
      }
      else if (prevVid != vid && ddup != 2) firstkv = countprev;
      if (prevVid != vid && prevVid && countprev != 0)
      {
        // A video section just ended: update the statistics.
        if (longestT > longestV) longestV = longestT;
        ++countVT;
        longestT = 0;
      }
    }
    if (!vid && tcfv1) fprintf(f, "%d,%d,%4.6f\n", firstkv, count - 1, mkvfps);
    double filmCf = ((double)(filmC) / (double)(nfrms + 1))*100.0;
    double videoCf = ((double)(videoC) / (double)(nfrms + 1))*100.0;
    fprintf(f, "# vfr stats:  %05.2f%c film  %05.2f%c video\n", filmCf, '%', videoCf, '%');
    fprintf(f, "# vfr stats:  %d - film  %d - video  %d - total\n", filmC, videoC, nfrms + 1);
    fprintf(f, "# vfr stats:  longest vid section - %d frames\n", longestV);
    fprintf(f, "# vfr stats:  # of detected vid sections - %d", countVT);
    fclose(f);
    f = nullptr;
  }
  else
  {
    throw TIVTCError("TDecimate:  mkvOut file output error (cannot create file)!");
  }
  if (aLUT.size())
    aLUT.resize(0);

  aLUT.resize(vi.numFrames + 1, 0);

  // aLUT[w] = input frame number of output frame w (every frame not marked 2).
  i = w = 0;
  while (i <= nfrms && w <= vi.numFrames - 1)
  {
    if (input_magic_numbers[i] != 2)
    {
      aLUT[w] = i;
      ++w;
    }
    ++i;
  }
  input_magic_numbers.resize(0);
  nfrmsN = vi.numFrames - 1;

  if (f != nullptr) fclose(f);

  //nfrms and nfrmsN may give some hints as well.
  //8day
  if (orgOut.size())
  {
    if (aLUT.empty())
      throw TIVTCError("TDecimate: aLUT is nullptr!");
    FILE *orgOutF = tivtc_fopen(orgOut.c_str(), "w");
    if (orgOutF == nullptr)
      throw TIVTCError("TDecimate: cannot create orgOut file!");
    for (int n = 0; n<vi.numFrames; ++n)
    {
      fprintf(orgOutF, "%d\n", aLUT[n]);
    }
    fclose(orgOutF);
  }

} // init mode 5

// Constructs the TDecimate filter instance.
//
// The constructor
//   1. copies every user parameter into the matching member,
//   2. validates the parameters and both clips (throws TIVTCError on failure),
//   3. sizes the cycle buffers and derives block shifts / thresholds,
//   4. reads the PROP_TFMPP property from frame 0 of the input clip,
//   5. optionally parses the metrics input file (input), the overrides file
//      (ovr) and the TFM output file (tfmIn),
//   6. performs the mode specific setup (output frame count / frame rate,
//      timecode file creation for modes 3 and 6, init_mode_5() for mode 5),
//   7. finally takes width, height and format of the output from clip2.
//
// Parameters: each `_xxx` argument initialises the member `xxx`. `_child` is
// the clip that gets analysed, `_clip2` the clip whose dimensions/format the
// output video info is taken from. `_vsapi` is stored and used for all
// VapourSynth calls; `core` is only forwarded to init_mode_5().
//
// Throws: TIVTCError for every invalid parameter, unreadable/uncreatable file
// or malformed file content. Every FILE handle opened here is closed before
// the exception is thrown.
//
// NOTE(review): clip2 is passed to getVideoInfo() unconditionally, so callers
// are assumed to always supply a valid node - confirm against the caller.
TDecimate::TDecimate(VSNodeRef *_child, int _mode, int _cycleR, int _cycle, double _rate,
  double _dupThresh, double _vidThresh, double _sceneThresh, int _hybrid,
  int _vidDetect, int _conCycle, int _conCycleTP, const char* _ovr,
  const char* _output, const char* _input, const char* _tfmIn, const char* _mkvOut,
  int _nt, int _blockx, int _blocky, bool _debug, bool _display, int _vfrDec,
  bool _batch, bool _tcfv1, bool _se, bool _chroma, bool _exPP, int _maxndl, bool _m2PA,
  bool _predenoise, bool _noblend, bool _ssd, bool _usehints, VSNodeRef *_clip2,
  int _sdlim, int _opt, const char* _orgOut, const VSAPI *_vsapi, VSCore *core)
    : vsapi(_vsapi), child(_child),
  mode(_mode),
  cycleR(_cycleR), cycle(_cycle), rate(_rate), dupThresh(_dupThresh),
  hybrid(_hybrid), vidThresh(_vidThresh),
  conCycleTP(_conCycleTP), vidDetect(_vidDetect), sceneThresh(_sceneThresh),
  conCycle(_conCycle), ovr(_ovr), input(_input),
  nt(_nt), output(_output), mkvOut(_mkvOut), tfmIn(_tfmIn), blockx(_blockx), blocky(_blocky),
  vfrDec(_vfrDec), debug(_debug), display(_display), batch(_batch), tcfv1(_tcfv1), se(_se),
  maxndl(_maxndl), chroma(_chroma), m2PA(_m2PA), exPP(_exPP),
  noblend(_noblend), predenoise(_predenoise), ssd(_ssd), sdlim(_sdlim),
  opt(_opt), clip2(_clip2), orgOut(_orgOut),
  prev(5, 0), curr(5, 0), next(5, 0), nbuf(5, 0), usehints(_usehints), diff(nullptr, nullptr)
{
    vi_child = vsapi->getVideoInfo(child);
    vi = *vi_child;

  mkvOutF = nullptr;
  FILE *f = nullptr;
  char linein[1024], *linep, *linet;
  
  bool tfmFullInfo = false, metricsFullInfo = false;
  
  fps = (double)vi.fpsNum / (double)vi.fpsDen;

  // opt == 0 disables every CPU specific code path.
  cpuFlags = *getCPUFeatures();
  if (opt == 0) memset(&cpuFlags, 0, sizeof(cpuFlags));

  // ---------------------------------------------------------------------
  // Parameter and clip validation
  // ---------------------------------------------------------------------
  if (!vi.format)
      throw TIVTCError("TDecimate: the clip must have constant format.");

  if (vi.width == 0 || vi.height == 0)
      throw TIVTCError("TDecimate: the clip must have constant dimensions.");

  if (vi.format->bitsPerSample > 16)
    throw TIVTCError("TDecimate:  only 8-16 bit formats supported!");
  if (vi.format->colorFamily != cmYUV)
    throw TIVTCError("TDecimate:  YUV colorspaces only!");
  if (mode < 0 || mode > 7)
    throw TIVTCError("TDecimate:  mode must be set to 0, 1, 2, 3, 4, 5, 6, or 7!");
  if (mode == 3 && mkvOut.empty())
    throw TIVTCError("TDecimate:  an mkvOut file must be specified in mode 3!");
  if (mode == 5 && mkvOut.empty())
    throw TIVTCError("TDecimate:  an mkvOut file must be specified in mode 5!");
  if (mode == 6 && mkvOut.empty())
    throw TIVTCError("TDecimate:  an mkvOut file must be specified in mode 6!");
  if (hybrid < 0 || hybrid > 3)
    throw TIVTCError("TDecimate:  hybrid must be set to 0, 1, 2, or 3!");
  if (mode == 3 && hybrid != 2)
    throw TIVTCError("TDecimate:  mode 3 can only be used with hybrid = 2!");
  if (mode == 5 && hybrid != 2)
    throw TIVTCError("TDecimate:  mode 5 can only be used with hybrid = 2!");
  if (mode == 6 && hybrid != 2)
    throw TIVTCError("TDecimate:  mode 6 can only be used with hybrid = 2!");
  if (hybrid == 3 && mode > 1)
    throw TIVTCError("TDecimate:  hybrid = 3 can only be used with modes 0 and 1!");
  if (hybrid == 1 && mode > 1)
    throw TIVTCError("TDecimate:  hybrid = 1 can only be used with modes 0 and 1!");
  if (hybrid > 0 && cycleR > 1)
    throw TIVTCError("TDecimate:  hybrid processing is currently limited to cycleR=1 cases only!");
  if (mode < 2 && hybrid > 1 && hybrid != 3)
    throw TIVTCError("TDecimate:  only hybrid = 0, 1, or 3 is supported in modes 0 and 1!");
  if (cycleR >= cycle || cycleR <= 0)
    throw TIVTCError("TDecimate:  cycleR must be greater than 0 and less than cycle!");
  if (cycle < 2 || cycle > vi.numFrames)
    throw TIVTCError("TDecimate:  cycle must be at least 2 and less than or equal to the number of frames in the clip!");
  if (sceneThresh < 0.0 || sceneThresh > 100.0)
    throw TIVTCError("TDecimate:  sceneThresh must be in the range 0 to 100!");
  if (rate >= fps && (mode == 2 || mode == 7))
    throw TIVTCError("TDecimate:  mode 2 and 7 - new rate must be less than current rate!");
  if (vidDetect < 0 || vidDetect > 4)
    throw TIVTCError("TDecimate:  vidDetect must be set to 0, 1, 2, 3, or 4!");
  if (conCycle > 2)
    throw TIVTCError("TDecimate:  conCycle cannot be greater than 2!");
  if (mode == 4 && (ovr.size() || tfmIn.size()))
    throw TIVTCError("TDecimate:  cannot use an ovr or tfmIn file when in mode 4!");
  if (vfrDec != 0 && vfrDec != 1)
    throw TIVTCError("TDecimate:  vfrDec must be set to 0 or 1!");
  if (output.size() && (mode == 5 || mode == 6))
    throw TIVTCError("TDecimate:  output not supported in mode 5 and 6 (you should already have the metrics)!");
  if (blockx != 4 && blockx != 8 && blockx != 16 && blockx != 32 && blockx != 64 &&
    blockx != 128 && blockx != 256 && blockx != 512 && blockx != 1024 && blockx != 2048)
    throw TIVTCError("TDecimate:  illegal blockx size!");
  if (blocky != 4 && blocky != 8 && blocky != 16 && blocky != 32 && blocky != 64 &&
    blocky != 128 && blocky != 256 && blocky != 512 && blocky != 1024 && blocky != 2048)
    throw TIVTCError("TDecimate:  illegal blocky size!");
  if (mode == 2 && maxndl != -200 && (maxndl < 1 || maxndl > 99))
    throw TIVTCError("TDecimate:  maxndl must be set to a value between 1 and 99 inclusive!");
  // sdlim is only honoured in modes 0, 1 and 3 and only when cycleR > 1.
  if ((mode != 0 && mode != 1 && mode != 3) || cycleR == 1)
    sdlim = 0;
  // With cycleR == 1 the left side is 0, so the division below is only
  // reached when cycleR > 1.
  if ((abs(sdlim) + 1)*(cycleR - 1) >= cycle) {
      char msg[160] = { 0 };
    snprintf(msg, 160, "TDecimate:  invalid sdlim setting (%d through %d (inclusive) are allowed)!", 0, int(ceil(cycle / double(cycleR - 1))) - 2);
    throw TIVTCError(msg);
  }
  if (opt < 0 || opt > 4)
    throw TIVTCError("TDecimate:  opt must be set to 0, 1, 2, 3, or 4!");

  vi_clip2 = vsapi->getVideoInfo(clip2);

  // The format pointer is dereferenced below and copied into vi at the end,
  // so it gets the same constant-format check as the input clip.
  if (!vi_clip2->format)
    throw TIVTCError("TDecimate:  clip2 must have constant format.");
  if (vi.numFrames != vi_clip2->numFrames)
    throw TIVTCError("TDecimate:  clip2 must have the same number of frames as the input clip!");
  if (vi_clip2->format->colorFamily != cmYUV)
    throw TIVTCError("TDecimate:  clip2 must be YUV colorspace!");
  if (vi_clip2->format->bitsPerSample > 16)
    throw TIVTCError("TDecimate:  clip2: only 8-16 bit formats supported!");

//  if (debug)
//  {
//    sprintf(buf, "TDecimate:  %s by tritical\n", VERSION);
//    OutputDebugString(buf);
//  }

  // ---------------------------------------------------------------------
  // Cycle buffers and derived constants
  // ---------------------------------------------------------------------
  ecf = false;
  // The cycle objects are created with size 5; enlarge them for longer cycles.
  if (cycle > 5 && mode != 4 && mode != 6 && mode != 7)
  {
    prev.setSize(cycle);
    curr.setSize(cycle);
    next.setSize(cycle);
    nbuf.setSize(cycle);
  }
  if (sdlim)
  {
    prev.sdlim = sdlim;
    curr.sdlim = sdlim;
    next.sdlim = sdlim;
    nbuf.sdlim = sdlim;
  }
  if (mode == 4 || mode == 5 || mode == 6) {
//    child->SetCacheHints(CACHE_GENERIC, 3);
  }
  else if (mode != 2 && mode != 7)
  {
    int cacheRange = cycle * 4 + 1;
    if (cacheRange < 1) cacheRange = 1;
    if (input.size() || cycle >= 26)
    {
//      if (cacheRange > 100)
//        child->SetCacheHints(CACHE_GENERIC, 100);
//      else
//        child->SetCacheHints(CACHE_GENERIC, cacheRange);
    }
    else
    {
      ecf = true; // ecf is not used anywhere. It had to do with the cache manipulation, which we can't do in VapourSynth.
//      child->SetCacheHints(0, -20);
    }
  }

  // vidDetect == 4 is handled as vidDetect == 3 with the cve flag set.
  if (vidDetect == 4)
  {
    vidDetect = 3;
    cve = true;
  }
  else cve = false;
  lastn = -1;
  fullInfo = false;
  same_thresh = diff_thresh = 0;
  linearCount = -342;
  mode2_num = mode2_den = mode2_numCycles = -20;
  memset(mode2_cfs, 0, 10 * sizeof(int));
  nfrms = nfrmsN = vi.numFrames - 1;
  prev.length = curr.length = next.length = nbuf.length = cycle;
  prev.maxFrame = curr.maxFrame = next.maxFrame = nbuf.maxFrame = nfrms;
  // blockx / blocky were validated as powers of two in 4..2048, so the shift
  // is simply log2 of the block size.
  blockx_shift = blockx == 4 ? 2 : blockx == 8 ? 3 : blockx == 16 ? 4 : blockx == 32 ? 5 :
    blockx == 64 ? 6 : blockx == 128 ? 7 : blockx == 256 ? 8 : blockx == 512 ? 9 :
    blockx == 1024 ? 10 : 11;
  blocky_shift = blocky == 4 ? 2 : blocky == 8 ? 3 : blocky == 16 ? 4 : blocky == 32 ? 5 :
    blocky == 64 ? 6 : blocky == 128 ? 7 : blocky == 256 ? 8 : blocky == 512 ? 9 :
    blocky == 1024 ? 10 : 11;
  blocky_half = blocky >> 1;
  blockx_half = blockx >> 1;

  // ---------------------------------------------------------------------
  // Read TFM's PP value from the properties of the first input frame
  // ---------------------------------------------------------------------
  // getFrame() appends its own error text after the fixed prefix.
  char error[512] = "TDecimate: Couldn't fetch the first frame from the input clip to read TFM's PP value. Reason: ";
  size_t len = strlen(error);

  const VSFrameRef *first_frame = vsapi->getFrame(0, child, error + len, 512 - len);
  if (first_frame == nullptr)
      throw TIVTCError(error);

  const VSMap *props = vsapi->getFramePropsRO(first_frame);

  int err;
  int64_t TFMPP = vsapi->propGetInt(props, PROP_TFMPP, 0, &err);
  vsapi->freeFrame(first_frame);
  if (err)
      useTFMPP = false;
  else
      useTFMPP = TFMPP > 1;

  // exPP forces the post-processing handling regardless of the property.
  if (exPP) useTFMPP = true;


  // ---------------------------------------------------------------------
  // Maximum block difference and scene change thresholds
  // ---------------------------------------------------------------------
    if (chroma)
    {
      const int blockx_chroma = blockx >> vi.format->subSamplingW;
      const int blocky_chroma = blocky >> vi.format->subSamplingH;
      if (ssd) 
        MAX_DIFF = (uint64_t)(sqrt(219.0*219.0*blockx*blocky + 224.0*224.0* blockx_chroma * blocky_chroma *2.0));
      else 
        MAX_DIFF = (uint64_t)(219.0*blockx*blocky + 224.0*blockx_half*blocky_half*2.0);
    }
    else
    {
      if (ssd) 
        MAX_DIFF = (uint64_t)(sqrt(219.0*219.0*blockx*blocky));
      else
        MAX_DIFF = (uint64_t)(219.0*blockx*blocky);
    }
    if (ssd)
    {
      sceneThreshU = (uint64_t)((sceneThresh*sqrt(219.0*219.0*vi.height*vi.width)) / 100.0);
      sceneDivU = (uint64_t)(sqrt(219.0*219.0*vi.width*vi.height));
    }
    else
    {
      sceneThreshU = (uint64_t)((sceneThresh*219.0*vi.height*vi.width) / 100.0);
      sceneDivU = (uint64_t)(219.0*vi.width*vi.height);
    }


  // Block difference work buffer; not allocated in mode 6.
  if (mode <= 5 || mode == 7)
  {
    diff = decltype(diff) (vs_aligned_malloc<uint64_t>((((vi.width + blockx_half) >> blockx_shift) + 1)*(((vi.height + blocky_half) >> blocky_shift) + 1) * 4 * sizeof(uint64_t), 16), &vs_aligned_free);
    if (diff == nullptr) throw TIVTCError("TDecimate:  malloc failure (diff)!");
  }

  // ---------------------------------------------------------------------
  // Metrics output file: only verify that it can be created and remember its
  // full path; the destructor writes the actual content.
  // ---------------------------------------------------------------------
  if (output.size())
  {
    if ((f = tivtc_fopen(output.c_str(), "w")) != nullptr)
    {
      _fullpath(outputFull, output.c_str(), MAX_PATH);
      calcCRC(child, 15, outputCrc, vsapi);
      fclose(f);
      f = nullptr;
      metricsOutArray.resize(vi.numFrames * 2, UINT64_MAX);
    }
    else throw TIVTCError("TDecimate:  output error (cannot create output file)!");
  }

  // ---------------------------------------------------------------------
  // Metrics input file
  // Header line:  crc32 = <hex>, blockx = <n>, blocky = <n>, chroma = <T|F>
  // Data lines:   <frame> <metricU> <metricF>
  // ---------------------------------------------------------------------
  if (input.size())
  {
    metricsArray.resize(vi.numFrames * 2);

    // UINT64_MAX marks "no entry"; in batch mode 5/6 missing entries are 0.
    for (int h = 0; h < vi.numFrames * 2; ++h)
    {
      if (!batch || (mode != 5 && mode != 6)) metricsArray[h] = UINT64_MAX;
      else metricsArray[h] = 0;
    }
    if ((f = tivtc_fopen(input.c_str(), "r")) != nullptr)
    {
      uint64_t metricU, metricF;
      int w;
      while (fgets(linein, 1024, f) != nullptr)
      {
        // Skip empty lines and comments.
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == '#' || linein[0] == ';')
          continue;
        linep = linein;
        while (*linep != ' ' && *linep != 0 && *linep != 'c') linep++;
        if (*linep == 'c')
        {
          if (_strnicmp(linein, "crc32 = ", 8) == 0)
          {
            // Skip the two spaces of "crc32 = " to reach the hex value.
            linet = linein;
            while (*linet != ' ') linet++;
            linet++;
            while (*linet != ' ') linet++;
            linet++;
            unsigned int z, tempCrc;
            sscanf(linet, "%x", &z);
            calcCRC(child, 15, tempCrc, vsapi);
            if (tempCrc != z && !batch)
            {
              fclose(f);
              f = nullptr;
              char msg[160] = { 0 };
              snprintf(msg, 160, "TDecimate:  crc32 in input file does not match that of the current clip (%#x vs %#x)!",
                z, tempCrc);
              throw TIVTCError(msg);
            }
            // First comma -> blockx. The terminator test must look at the
            // character, otherwise a header without a comma would scan past
            // the end of the line buffer.
            linep = linein;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++; linep++;
            int j;
            if (_strnicmp(linep, "blockx = ", 9) == 0)
            {
              while (*linep != '=') linep++;
              linep++; linep++;
              sscanf(linep, "%d", &j);
              if (j != blockx)
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  current blockx value does not match"
                  " that which was used to create the given input file!");
              }
            }
            // Second comma -> blocky.
            linep = linein;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++; linep++;
            if (_strnicmp(linep, "blocky = ", 9) == 0)
            {
              while (*linep != '=') linep++;
              linep++; linep++;
              sscanf(linep, "%d", &j);
              if (j != blocky)
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  current blocky value does not match"
                  " that which was used to create the given input file!");
              }
            }
            // Third comma -> chroma.
            linep = linein;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++;
            while (*linep != ',' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++; linep++;

            char ch;
            if (_strnicmp(linep, "chroma = ", 9) == 0)
            {
              while (*linep != '=') linep++;
              linep++; linep++;
              sscanf(linep, "%c", &ch);
              if (((ch == 'T' || ch == 't') && !chroma) || ((ch == 'F' || ch == 'f') && chroma))
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  current chroma setting does not match"
                  " that which was used to create the given input file!");
              }
            }
          }
        }
        else if (*linep == ' ' && *(linep + 1) != 0 && *(linep + 1) != ' ')
        {
          // Data line: frame number followed by the two metrics.
          sscanf(linein, "%d %" PRIu64 " %" PRIu64 "", &w, &metricU, &metricF);
          if (w < 0 || w > nfrms)
          {
            fclose(f);
            f = nullptr;
            throw TIVTCError("TDecimate:  input error (out of range frame #)!");
          }
          metricsArray[w * 2] = metricU;
          metricsArray[w * 2 + 1] = metricF;
        }
      }
      fclose(f);
      f = nullptr;
      // The metrics are complete only if every frame received an entry.
      metricsFullInfo = true;
      for (int h = 0; h < vi.numFrames * 2; h += 2)
      {
        if (metricsArray[h] == UINT64_MAX)
        {
          metricsFullInfo = false;
          if ((mode == 5 || mode == 6) && !batch)
          {
            throw TIVTCError("TDecimate:  input error (mode 5 and 6, all frames must have entries)!");
          }
        }
      }
    }
    else
    {
      throw TIVTCError("TDecimate:  input error (cannot open input file)!");
    }
  }
  else if (mode == 5)
  {
    // Mode 5 without an input file: placeholder metrics for every frame.
    metricsArray.resize(vi.numFrames * 2);

    for (int h = 0; h < vi.numFrames * 2; h += 2)
    {
      metricsArray[h + 0] = UINT64_MAX - 1;
      metricsArray[h + 1] = 0;
    }
  }

  // ---------------------------------------------------------------------
  // Overrides file
  // Accepted line forms:
  //   <frame> <+|->            keep / drop a single frame
  //   <frame> <f|v>            mark a single frame as film / video
  //   <start>,<end> <f|v>      mark a range as film / video
  //   <start>,<end> <+|->      keep / drop a whole range
  //   <start>,<end> <+-...>    repeat a keep/drop pattern over a range
  // An <end> of 0 means "last frame". The low two bits of an ovrArray entry
  // (mask 0xFC) hold keep/drop, bits 2-3 (mask 0xF3) hold film/video.
  // ---------------------------------------------------------------------
  if (ovr.size())
  {
    if ((f = tivtc_fopen(ovr.c_str(), "r")) != nullptr)
    {
      if (ovrArray.empty())
      {
        ovrArray.resize(vi.numFrames);
        if (!batch || (mode != 5 && mode != 6)) memset(ovrArray.data(), 112, vi.numFrames);
        else memset(ovrArray.data(), 0, vi.numFrames);
      }
      int q, w, z, count = 0;
      while (fgets(linein, 1024, f) != 0)
      {
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
          continue;
        linep = linein;
        while (*linep != 0 && *linep != ' ' && *linep != ',') linep++;
        if (*linep == ' ')
        {
          // Single frame entry. Ignore lines consisting of blanks only.
          linet = linein;
          while (*linet != 0)
          {
            if (*linet != ' ' && *linet != 10) break;
            linet++;
          }
          if (*linet == 0) continue;
          linep++;
          if (*linep == '-' || *linep == '+')
          {
            sscanf(linein, "%d", &z);
            if (z<0 || z>nfrms)
            {
              fclose(f);
              f = nullptr;
              throw TIVTCError("TDecimate:  ovr file error (out of range frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep != 0)
            {
              linep++;
              q = *linep;
              if (q == 45) q = DROP_FRAME;        // '-'
              else if (q == 43) q = KEEP_FRAME;   // '+'
              else
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  ovr file error (invalid specifier)!");
              }
              ovrArray[z] &= 0xFC;
              ovrArray[z] |= q;
            }
          }
          else if (*linep == 'f' || *linep == 'v')
          {
            sscanf(linein, "%d", &z);
            if (z<0 || z>nfrms)
            {
              fclose(f);
              f = nullptr;
              throw TIVTCError("TDecimate:  ovr file error (out of range frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep != 0)
            {
              linep++;
              q = *linep;
              if (q == 102) q = FILM;         // 'f'
              else if (q == 118) q = VIDEO;   // 'v'
              else
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  ovr file error (invalid symbol)!");
              }
              ovrArray[z] &= 0xF3;
              ovrArray[z] |= q;
            }
          }
        }
        else if (*linep == ',')
        {
          // Range entry.
          while (*linep != ' ' && *linep != 0) linep++;
          if (*linep == 0) continue;
          linep++;
          if (*linep == 'f' || *linep == 'v')
          {
            sscanf(linein, "%d,%d", &z, &w);
            if (w == 0) w = nfrms;
            if (z<0 || z>nfrms || w<0 || w>nfrms || w < z)
            {
              fclose(f);
              f = nullptr;
              throw TIVTCError("TDecimate:  input file error (out of range frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep != 0)
            {
              linep++;
              q = *linep;
              if (q == 102) q = FILM;         // 'f'
              else if (q == 118) q = VIDEO;   // 'v'
              else
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  input file error (invalid specifier)!");
              }
              while (z <= w)
              {
                ovrArray[z] &= 0xF3;
                ovrArray[z] |= q;
                ++z;
              }
            }
          }
          else if (*linep == '-' || *linep == '+')
          {
            sscanf(linein, "%d,%d", &z, &w);
            if (w == 0) w = nfrms;
            if (z<0 || z>nfrms || w<0 || w>nfrms || w < z)
            {
              fclose(f);
              f = nullptr;
              throw TIVTCError("TDecimate:  input file error (out of range frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            linep++;
            if (*(linep + 1) == '-' || *(linep + 1) == '+')
            {
              // Pattern of several specifiers: store the pattern once ...
              count = 0;
              while ((*linep == '-' || *linep == '+') && (z + count <= w))
              {
                q = *linep;
                if (q == 45) q = DROP_FRAME;        // '-'
                else if (q == 43) q = KEEP_FRAME;   // '+'
                else
                {
                  fclose(f);
                  f = nullptr;
                  throw TIVTCError("TDecimate:  input file error (invalid specifier)!");
                }
                ovrArray[z + count] &= 0xFC;
                ovrArray[z + count] |= q;
                ++count;
                linep++;
              }
              // ... then repeat it, copying from `count` frames earlier,
              // until the end of the range.
              while (z + count <= w)
              {
                ovrArray[z + count] &= 0xFC;
                ovrArray[z + count] |= (ovrArray[z] & 0x03);
                ++z;
              }
            }
            else
            {
              // Single specifier applied to the whole range.
              q = *linep;
              if (q == 45) q = DROP_FRAME;        // '-'
              else if (q == 43) q = KEEP_FRAME;   // '+'
              else
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  input file error (invalid specifier)!");
              }
              while (z <= w)
              {
                ovrArray[z] &= 0xFC;
                ovrArray[z] |= q;
                ++z;
              }
            }
          }
        }
      }
      fclose(f);
      f = nullptr;
    }
    else throw TIVTCError("TDecimate:  ovr error (could not open ovr file)!");
  }

  // ---------------------------------------------------------------------
  // TFM output file: per frame match information
  // The match code is stored in bits 4-6 of the ovrArray entry; the value
  // 0x70 in those bits means "no match information".
  // ---------------------------------------------------------------------
  if (tfmIn.size())
  {
    bool d2vmarked, micmarked;
    if ((f = tivtc_fopen(tfmIn.c_str(), "r")) != nullptr)
    {
      int fieldt, firstLine, z, q, r;
      if (ovrArray.empty())
      {
        ovrArray.resize(vi.numFrames);
        if (!batch || mode != 5) memset(ovrArray.data(), 112, vi.numFrames);
        else memset(ovrArray.data(), 0, vi.numFrames);
      }
      fieldt = firstLine = 0;
      while (fgets(linein, 1024, f) != nullptr)
      {
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
          continue;
        ++firstLine;
        linep = linein;
        while (*linep != 'f' && *linep != 'F' && *linep != 0 && *linep != ' ' && *linep != 'c') linep++;
        if (*linep == 'f' || *linep == 'F')
        {
          // The field setting is only read from the first non-comment line.
          if (firstLine == 1)
          {
            if (_strnicmp(linein, "field = top", 11) == 0) fieldt = 1;
            else if (_strnicmp(linein, "field = bottom", 14) == 0) fieldt = 0;
          }
        }
        else if (*linep == ' ')
        {
          // Blank-only lines do not count as a line.
          linet = linein;
          while (*linet != 0)
          {
            if (*linet != ' ' && *linet != 10) break;
            linet++;
          }
          if (*linet == 0) { --firstLine; continue; }
          sscanf(linein, "%d", &z);
          linep = linein;
          while (*linep != 'p' && *linep != 'c' && *linep != 'n' && *linep != 'u' &&
            *linep != 'b' && *linep != 'l' && *linep != 'h' && *linep != 0) linep++;
          if (*linep != 0)
          {
            if (z<0 || z>nfrms)
            {
              fclose(f);
              f = nullptr;
              throw TIVTCError("TDecimate:  tfmIn file error (out of range frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep != 0)
            {
              linep++;
              // Translate the match letter into its numeric code.
              q = *linep;
              if (q == 112) q = 0;        // 'p'
              else if (q == 99) q = 1;    // 'c'
              else if (q == 110) q = 2;   // 'n'
              else if (q == 98) q = 3;    // 'b'
              else if (q == 117) q = 4;   // 'u'
              else if (q == 108) q = 5;   // 'l'
              else if (q == 104) q = 6;   // 'h'
              else
              {
                fclose(f);
                f = nullptr;
                throw TIVTCError("TDecimate:  tfmIn file error (invalid match specifier)!");
              }
              // With "field = top" the p/b and n/u codes are swapped.
              if (fieldt != 0)
              {
                if (q == 0) q = 3;
                else if (q == 2) q = 4;
                else if (q == 3) q = 0;
                else if (q == 4) q = 2;
              }
              d2vmarked = micmarked = false;
              linep++;
              while (*linep == ' ' && *linep != 0 && *linep != 10) linep++;
              if (*linep != 0 && *linep != 10)
              {
                r = *linep;
                if (r == 45 && useTFMPP)   // '-'
                {
                  // intentional noop q = q; 
                }
                else if (r == 43 && q < 5 && useTFMPP)   // '+'
                {
                  if (fieldt == 0) q = 5;
                  else q = 6;
                }
                else if (r == '1') d2vmarked = true;
                else if (r == '[') micmarked = true;
                else if (r != 43 && r != 45)
                {
                  fclose(f);
                  f = nullptr;
                  throw TIVTCError("TDecimate:  tfmIn file error (invalid specifier)!");
                }
              }
              // A d2v marker may also follow the +/- specifier.
              if (!d2vmarked && !micmarked && *linep != 0 && *linep != 10)
              {
                linep++;
                while (*linep == ' ' && *linep != 0 && *linep != 10) linep++;
                if (*linep != 0 && *linep != 10)
                {
                  r = *linep;
                  if (r == '1') d2vmarked = true;
                }
              }
              if (d2vmarked) ovrArray[z] |= ISD2VFILM;
              // Set bits 4-6, then clear them down to the match code q.
              ovrArray[z] |= 0x70;
              ovrArray[z] &= ((q << 4) | 0x8F);
            }
          }
        }
      }
      fclose(f);
      f = nullptr;
      // The match info is complete only if every frame received an entry.
      tfmFullInfo = true;
      for (int h = 0; h < vi.numFrames; ++h)
      {
        if ((ovrArray[h] & ISMATCH) == 0x70)
        {
          tfmFullInfo = false;
          if (mode == 5 && !batch)
          {
            throw TIVTCError("TDecimate:  tfmIn error (mode 5, all frames must have an entry)!");
          }
        }
      }
    }
    else throw TIVTCError("TDecimate:  tfmIn file error (could not open file)!");
  }
  else if (mode == 5)
  {
    // Mode 5 without a tfmIn file: every frame gets match code 1.
    if (ovrArray.empty())
    {
      ovrArray.resize(vi.numFrames, 16);
    }
    else
    {
      for (int i = 0; i < vi.numFrames; ++i)
      {
        ovrArray[i] |= 0x70;
        ovrArray[i] &= ((1 << 4) | 0x8F);
      }
    }
  }

  if (metricsFullInfo && (tfmFullInfo || !usehints)) fullInfo = true;
  else fullInfo = false;

  // ---------------------------------------------------------------------
  // Mode specific setup of the output video info
  // ---------------------------------------------------------------------
  if (mode < 2)
  {
    // M-in-N decimation: scale frame count and frame rate by
    // (cycle - cycleR) / cycle, except with hybrid == 3 where the frame
    // count and rate are left unchanged.
    if (hybrid != 3)
    {
      vi.numFrames = (vi.numFrames * (cycle - cycleR)) / cycle;
      nfrmsN = vi.numFrames - 1;
      muldivRational(&vi.fpsNum, &vi.fpsDen, cycle - cycleR, cycle);
    }
    else nfrmsN = vi.numFrames - 1;
  }
  else if (mode == 2)
  {
    if (metricsOutArray.empty())
    {
      metricsOutArray.resize(vi.numFrames * 2, UINT64_MAX);
    }
    mode2_decA.resize(vi.numFrames, -20);

    // buildDecStrategy() returns the frame rate that is actually achieved.
    double arate = buildDecStrategy();
    if (mode2_numCycles > 0)
    {
      if (curr.length < 0)
        throw TIVTCError("TDecimate:  unknown error with mode 2!");
//      if (curr.length <= 50)
//        child->SetCacheHints(CACHE_GENERIC, (curr.length * 2) + 1);
//      else
//        child->SetCacheHints(CACHE_GENERIC, 100);
      mode2_order.resize(std::max(curr.length + 10, 100));
      mode2_metrics.resize(std::max(curr.length + 10, 100));
    }
    else {
//      child->SetCacheHints(CACHE_GENERIC, 3);  // fixed to diameter (07/30/2005)
    }
    unsigned int num, den;
    if (FloatToFPS(arate, num, den))
        throw TIVTCError("TDecimate:  rate value is out of range.");
    vi.fpsNum = num;
    vi.fpsDen = den;
    vi.numFrames = (int)(vi.numFrames * (arate / fps));
    nfrmsN = vi.numFrames - 1;
  }
  else if (mode == 7)
  {
    if (metricsOutArray.empty())
    {
      metricsOutArray.resize(vi.numFrames * 2, UINT64_MAX);
      metricsOutArray[0] = 0;
    }
    if (aLUT.size()) aLUT.resize(0);
    aLUT.resize(vi.numFrames, -20);

    if (rate <= 0)
        throw TIVTCError("TDecimate:  rate must be greater than 0.");
    unsigned int num, den;
    if (FloatToFPS(rate, num, den))
        throw TIVTCError("TDecimate:  rate value is out of range.");
    vi.fpsNum = num;
    vi.fpsDen = den;
    vi.numFrames = (int)(vi.numFrames * (rate / fps));
    nfrmsN = vi.numFrames - 1;
    mode2_decA.resize(vi.numFrames, -20);

//    child->SetCacheHints(CACHE_GENERIC, int((fps / rate) + 1.0) * 2 + 3);  // fixed to diameter (07/30/2005)
    // vidThresh / dupThresh are percentages of the maximum block difference.
    diff_thresh = uint64_t((vidThresh*MAX_DIFF) / 100.0);
    same_thresh = uint64_t((dupThresh*MAX_DIFF) / 100.0);
  }
  else if (mode == 3)
  {
    mkvfps = (fps*(cycle - cycleR)) / cycle;
    mkvfps2 = (fps*(cycle - cycleR - 1)) / cycle;
    lastGroup = -1;
    lastCycle = -cycle;
    retFrames = -200;
    lastType = linearCount = 0;
    // The timecode file stays open in mkvOutF; the destructor closes it.
    if ((mkvOutF = tivtc_fopen(mkvOut.c_str(), "w")) != nullptr)
    {
      if (tcfv1)
      {
        fprintf(mkvOutF, "# timecode format v1\n");
        fprintf(mkvOutF, "Assume %4.6f\n", fps);
      }
      else fprintf(mkvOutF, "# timecode format v2\n");
      fprintf(mkvOutF, "# TDecimate %s by tritical\n", VERSION);
      fprintf(mkvOutF, "# Mode 3 - Auto-generated mkv timecodes file\n");
    }
    else throw TIVTCError("TDecimate:  mode 3 error (cannot create mkvOut file)!");
  }
  else if (mode == 5)
  {
    init_mode_5(core);
    diff = nullptr; // mode 5 is using diff buffer only at init
  } // mode 5
  else if (mode == 6)
  {
    // Mode 6: frames whose first metric is 0 are treated as duplicates of
    // the preceding frame. Each run of `dups` identical frames is mapped to
    // one or more output frames with a matching duration, the timecodes are
    // written to mkvOut, and aLUT maps output frame -> input frame.
    // A value of 2 marks an input frame that is kept.
    std::vector<int> input_magic_numbers(vi.numFrames, 0);

    // frameDen is assigned by every branch of the dups loop before it is
    // read; the initialiser only guards against uninitialised-use warnings.
    int j = 0, k = 0, frm = 0, dups, frameDen = 0;
    double timestamp = 0.0;
    int lastt = 0, lastf = 0;
    if ((f = tivtc_fopen(mkvOut.c_str(), "w")) == nullptr)
    {
      throw TIVTCError("TDecimate:  unable to create mkvOut file!");
    }
    if (tcfv1)
    {
      fprintf(f, "# timecode format v1\n");
      fprintf(f, "Assume 23.976024\n");
    }
    else fprintf(f, "# timecode format v2\n");
    fprintf(f, "# TDecimate %s by tritical\n", VERSION);
    fprintf(f, "# Mode 6 - Auto-generated mkv timecodes file\n");
    
    // Variable frame rate output.
    vi.fpsNum = 0;
    vi.fpsDen = 0;
    while (j < vi.numFrames)
    {
      // Count the length of the run starting at frame j.
      dups = 1;
      ++j;
      while (j < vi.numFrames && metricsArray[j * 2] == 0)
      {
        ++dups;
        ++j;
      }
      while (dups > 0)
      {
        if (dups == 1) // 119.88012
        {
          if (!tcfv1)
          {
            fprintf(f, "%3.6f\n", timestamp*1000.0);
            timestamp += 0.00834166665833;
          }
          else if (lastt != 1 && lastt > 0)
          {
            if (lastt != 5) fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 1;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 1;
          input_magic_numbers[j - 1] = 2;
          frameDen = 120000;
          dups = 0;
          ++k;
        }
        else if (dups == 2) // 59.94006
        {
          if (!tcfv1)
          {
            fprintf(f, "%3.6f\n", timestamp*1000.0);
            timestamp += 0.01668333331665;
          }
          else if (lastt != 2 && lastt > 0)
          {
            if (lastt != 5) fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 2;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 2;
          input_magic_numbers[j - 2] = 2;
          frameDen = 60000;
          dups = 0;
          ++k;
        }
        else if (dups == 3) // 39.96004
        {
          if (!tcfv1)
          {
            fprintf(f, "%3.6f\n", timestamp*1000.0);
            timestamp += 0.02502499997498;
          }
          else if (lastt != 3 && lastt > 0)
          {
            if (lastt != 5) fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 3;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 3;
          input_magic_numbers[j - 3] = 2;
          frameDen = 40000;
          dups = 0;
          ++k;
        }
        else if ((dups % 4) == 0) // 29.97003
        {
          if (!tcfv1)
          {
            int i, repeat = dups >> 2;
            for (i = 0; i < repeat; ++i)
            {
              fprintf(f, "%3.6f\n", timestamp*1000.0);
              timestamp += 0.03336666663330;
            }
          }
          else if (lastt != 4 && lastt > 0)
          {
            if (lastt != 5) fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 4;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 4;
          k += (dups >> 2);
          for (int i = 0; i < dups; i += 4) input_magic_numbers[j - dups + i] = 2;
          frameDen = 30000;
          dups = 0;
        }
        else if ((dups % 5) == 0) // 23.97602
        {
          if (!tcfv1)
          {
            int i, repeat = dups / 5;
            for (i = 0; i < repeat; ++i)
            {
              fprintf(f, "%3.6f\n", timestamp*1000.0);
              timestamp += 0.04170834024997;
            }
          }
          else if (lastt != 5 && lastt > 0)
          {
            fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 5;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 5;
          k += (dups / 5);
          for (int i = 0; i < dups; i += 5) input_magic_numbers[j - dups + i] = 2;
          frameDen = 24000;
          dups = 0;
        }
        else if (dups > 5)
        {
          // Run length is not a multiple of 4 or 5: peel off one 23.976 fps
          // frame (5 input frames) and classify the remainder again.
          if (!tcfv1)
          {
            fprintf(f, "%3.6f\n", timestamp*1000.0);
            timestamp += 0.04170834024997;
          }
          else if (lastt != 5 && lastt > 0)
          {
            fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
            lastt = 5;
            lastf = k;
          }
          else if (lastt <= 0) lastt = 5;
          input_magic_numbers[j - dups] = 2;
          dups -= 5;
          frameDen = 24000;
          ++k;
        }
      }
      // Record the duration (1001 / frameDen) for every input frame of the run.
      while (frm < j)
      {
        frame_duration_info[frm].first = 1001;
        frame_duration_info[frm].second = frameDen;
        ++frm;
      }
    }
    if (tcfv1 && lastt != 5) fprintf(f, "%d,%d,%s\n", lastf, k - 1, cfps(lastt));
    vi.numFrames = k;
    if (aLUT.size()) { aLUT.resize(0); }
    aLUT.resize(vi.numFrames + 1, 0);

    // Build the output frame -> input frame lookup table.
    k = j = 0;
    while (k <= nfrms && j <= vi.numFrames - 1)
    {
      if (input_magic_numbers[k] == 2)
      {
        aLUT[j] = k;
        ++j;
      }
      ++k;
    }

    fclose(f);
    f = nullptr;
    nfrmsN = vi.numFrames - 1;
  } // mode 6
  if (f != nullptr) fclose(f);

    // The output frames take their dimensions and format from clip2.
    vi.width = vi_clip2->width;
    vi.height = vi_clip2->height;
    vi.format = vi_clip2->format;
}

// Destructor.
// If metrics were collected and an output file was requested, the metrics are
// written to outputFull (one "<frame> <metricU> <metricF>" line per frame
// that has at least one metric). Afterwards the mkvOutF handle is closed if
// it is open and the references to both clips are released.
TDecimate::~TDecimate()
{
  const bool writeMetrics = !metricsOutArray.empty() && !output.empty();
  if (writeMetrics)
  {
    // Failing to open the file is silently ignored.
    if (FILE *out = tivtc_fopen(outputFull, "w"))
    {
      fprintf(out, "#TDecimate %s by tritical\n", VERSION);
      fprintf(out, "crc32 = %x, blockx = %d, blocky = %d, chroma = %c\n", outputCrc, blockx, blocky,
        chroma ? 'T' : 'F');
      for (int frame = 0; frame <= nfrms; ++frame)
      {
        const uint64_t metricU = metricsOutArray[frame * 2];
        const uint64_t metricF = metricsOutArray[frame * 2 + 1];
        // UINT64_MAX marks a metric that was never computed.
        if (metricU == UINT64_MAX && metricF == UINT64_MAX)
          continue;
        fprintf(out, "%d %" PRIu64 " %" PRIu64 "\n", frame, metricU, metricF);
      }
      fclose(out);
    }
  }

  if (mkvOutF != nullptr)
    fclose(mkvOutF);

  vsapi->freeNode(child);
  vsapi->freeNode(clip2);
}
07070100000008000081A4000000000000000000000001671240C900002435000000000000000000000000000000000000002F00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimate.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#ifndef TDECIMATE_H
#define TDECIMATE_H

#include <stdio.h>
#include <math.h>
#ifndef _WIN32
#include <limits.h>
#include <stdlib.h>
#include <strings.h>
#define _strnicmp strncasecmp
#define _fullpath(absolute, relative, max) realpath((relative), (absolute))
#define MAX_PATH PATH_MAX
#else
#include <windows.h>
#endif
#include <memory>
#include <vector>
#include <string>
#include <unordered_map>
#include <VapourSynth.h>
#include <VSHelper.h>

#include "internal.h"
//#include "Font.h"
#include "Cycle.h"
#include "calcCRC.h"
//#include "profUtil.h"
//#include "Cache.h"
#include "cpufeatures.h"

// Anonymous enum holding a single integer constant.
// NOTE(review): the name suggests a status code meaning "the requested frame is
// ready"; its consumers are not visible in this header - confirm against the
// .cpp files before relying on that reading.
enum {
    RetFrameIsReady = 69,
};

// All the rest of this code was just copied from tdecimate.cpp because I'm
// too lazy to make it work such that it could call that code.
// pinterf 2020: moved the three versions to common codebase again: CalcMetricsExtracted().
// Parameter/result bundle passed by reference to CalcMetricsExtracted().
// Plain aggregate: no constructor, so every field must be set by the caller.
struct CalcMetricData {
  bool predenoise;
  VSVideoInfo vi; // stored by value (a copy, not a pointer)
  bool chroma;
  const CPUFeatures *cpuFlags; // non-owning pointer
  // Block geometry. NOTE(review): the *_half / *_shift fields presumably hold
  // block size / 2 and log2(block size) - confirm where they are filled in.
  int blockx;
  int blockx_half;
  int blockx_shift;
  int blocky;
  int blocky_half;
  int blocky_shift;
  uint64_t* diff; // raw pointer; buffer size and ownership are not visible here
  int nt;
  bool ssd; // ssd or sad

  bool metricF_needed; // from TDecimate: true, from FrameDiff: false
  // TDecimate
  uint64_t* metricF; // out!
  bool scene;
};

// Shared metric calculation between two frames; all settings and the output
// pointers travel in `d` (see CalcMetricData). Defined in another translation unit.
void CalcMetricsExtracted(const VSFrameRef *prevt, const VSFrameRef *currt, CalcMetricData& d, VSCore *core, const VSAPI *vsapi);

// NOTE(review): presumably writes a blurred version of `src` into `dst` using
// `iterations` passes, with `bchroma` selecting whether chroma is processed -
// confirm against the definition, which is not part of this header.
void blurFrame(const VSFrameRef *src, VSFrameRef *dst, int iterations,
  bool bchroma, const CPUFeatures *cpuFlags, VSCore *core, const VSAPI *vsapi);

// Whole-plane luma difference helpers (squared / absolute differences, judging by
// the SSD / SAD suffixes). Note that `cpuFlags` is a plain int here, unlike the
// CPUFeatures pointer used by the declarations above.
uint64_t calcLumaDiffYUY2_SSD(const uint8_t* prvp, const uint8_t* nxtp,
  int width, int height, int prv_pitch, int nxt_pitch, int nt, int cpuFlags);

uint64_t calcLumaDiffYUY2_SAD(const uint8_t* prvp, const uint8_t* nxtp,
  int width, int height, int prv_pitch, int nxt_pitch, int nt, int cpuFlags);

// TDecimate: the decimation filter of TIVTC (see the file header).
// This header only declares the class; all member functions are defined in the
// TDecimate*.cpp translation units. The class keeps raw VapourSynth node pointers
// and a FILE* as members and declares a destructor; the unique_ptr member makes
// the type non-copyable.
class TDecimate
{
private:
    // VapourSynth API table and input node. Raw, non-smart pointers.
    const VSAPI *vsapi;
    VSNodeRef *child;
    const VSVideoInfo *vi_child;
    const VSVideoInfo *vi_clip2;

  CPUFeatures cpuFlags;

  // User-supplied filter parameters; they mirror the constructor arguments of
  // the same names (constructor argument `_x` corresponds to member `x`).
  int mode;
  int cycleR, cycle;
  double rate, dupThresh;
  int hybrid;
  double vidThresh;
  int conCycleTP;
  int vidDetect;
  double sceneThresh;
  int conCycle;
  std::string ovr;
  std::string input;
  int nt;
  std::string output;
  std::string mkvOut;
  std::string tfmIn;
  int blockx, blocky;
  int vfrDec;
  bool debug, display;
  bool batch;
  bool tcfv1;
  bool se;
  int maxndl;
  bool chroma;
  bool m2PA;
  bool exPP;
  bool noblend;
  bool predenoise;
  bool ssd; // sum of squared distances (false = SAD)
  int sdlim;
  int opt;
  VSNodeRef *clip2;
  std::string orgOut;
  // Cycle objects (type declared in Cycle.h).
  // NOTE(review): presumably previous / current / next cycle plus a scratch
  // buffer - confirm against the GetFrameMode* implementations.
  Cycle prev, curr, next, nbuf;

  // Internal state derived from the parameters or updated while serving frames.
  int nfrms, nfrmsN, linearCount;
  int blocky_shift, blockx_shift, blockx_half, blocky_half;
  int lastn;
  int lastFrame, lastCycle, lastGroup, lastType, retFrames;
  uint64_t MAX_DIFF, sceneThreshU, sceneDivU, diff_thresh, same_thresh;
  double fps, mkvfps, mkvfps2;
  bool useTFMPP, cve, ecf, fullInfo;
  bool usehints;
  // Buffer released through vs_aligned_free when the unique_ptr is destroyed.
  std::unique_ptr<uint64_t, decltype (&vs_aligned_free)> diff;
  std::vector<uint64_t> metricsArray, metricsOutArray, mode2_metrics;
  std::vector<int> aLUT, mode2_decA, mode2_order;
  std::unordered_map<int, std::pair<int, int>> frame_duration_info;
  unsigned int outputCrc;
  std::vector<uint8_t> ovrArray;
  int mode2_num, mode2_den, mode2_numCycles, mode2_cfs[10];
  FILE *mkvOutF;
  char outputFull[MAX_PATH];

  void init_mode_5(VSCore *core);
  void rerunFromStart(const int s, VSFrameContext *frameCtx, VSCore *core);
  void checkVideoMetrics(Cycle &c, double thresh);
  void checkVideoMatches(Cycle &p, Cycle &c);
  bool checkMatchDup(int mp, int mc);
  void findDupStrings(Cycle &p, Cycle &c, Cycle &n);

  int getTFMFrameProperties(const VSFrameRef *src, int& d2vfilm) const;
//  template<typename pixel_t>
//  int getHint_core(const VSFrameRef *src, int &d2vfilm);

//  template<typename pixel_t>
//  void restoreHint(const VSFrameRef *dst);

  void blendFrames(const VSFrameRef *src1, const VSFrameRef *src2, VSFrameRef *dst,
    double amount1);
  void calcBlendRatios(double &amount1, double &amount2, int &frame1, int &frame2, int n,
    int bframe, int cycleI);

  // Per-mode frame producers; the numeric suffix names the `mode` value(s) handled.
  const VSFrameRef *GetFrameMode01(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core);
  const VSFrameRef *GetFrameMode2(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core);
  const VSFrameRef *GetFrameMode3(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core);
  const VSFrameRef *GetFrameMode4(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core);
  const VSFrameRef *GetFrameMode56(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core);
  const VSFrameRef *GetFrameMode7(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core);
  void getOvrFrame(int n, uint64_t &metricU, uint64_t &metricF) const;
  void getOvrCycle(Cycle &current, bool mode2);
  void displayOutput(VSFrameRef *dst, int n,
    int ret, bool film, double amount1, double amount2, int f1, int f2);
  void formatMetrics(Cycle &current);
  void formatDups(Cycle &current);
  void formatDecs(std::string &buf, Cycle &current);
  void formatMatches(Cycle &current);
  void formatMatches(Cycle &current, Cycle &previous);
  void debugOutput1(int n, bool film, int blend);
  void debugOutput2(int n, int ret, bool film, int f1, int f2, double amount1, double amount2);
  void addMetricCycle(const Cycle &j);
  bool checkForObviousDecFrame(Cycle &p, Cycle &c, Cycle &n);
  void mostSimilarDecDecision(Cycle &p, Cycle &c, Cycle &n);
  int checkForD2VDecFrame(Cycle &p, Cycle &c, Cycle &n);
  bool checkForTwoDropLongestString(Cycle &p, Cycle &c, Cycle &n);
  int getNonDecMode2(int n, int start, int stop) const;
  double buildDecStrategy();
  void mode2MarkDecFrames(int cycleF);
  void removeMinN(int m, int n, int start, int stop);
  void removeMinN(int m, int n, uint64_t *metricsT, int *orderT, int &ovrC);
  int findDivisor(double decRatio, int min_den) const;
  int findNumerator(double decRatio, int divisor) const;
  double findCorrectionFactors(double decRatio, int num, int den, int rc[10]) const;
  void sortMetrics(uint64_t *metrics, int *order, int length) const;
  //void SedgeSort(uint64_t *metrics, int *order, int length);
  //void pQuickerSort(uint64_t *metrics, int *order, int lower, int upper);
  void calcMetricCycle(Cycle &current, bool scene, bool hnt, VSCore *core, VSFrameContext *frameCtx=nullptr) const;
  uint64_t calcMetric(const VSFrameRef *prevt, const VSFrameRef *currt, const VSVideoInfo *vi, int &blockNI,
    int &xblocksI, uint64_t &metricF, bool scene, VSCore *core) const;


  void calcBlendRatios2(double &amount1, double &amount2, int &frame1,
    int &frame2, int tf, Cycle &p, Cycle &c, Cycle &n, int remove);
  bool similar_group(int f1, int f2);
  bool same_group(int f1, int f2);
  bool diff_group(int f1, int f2);
  int diff_f(int f1, int f2);
  int mode7_analysis(int n) const;

  bool wasChosen(int i, int n);
  void calcMetricPreBuf(int n1, int n2, int pos, const VSVideoInfo *vit, bool scene, bool gethint, VSFrameContext *frameCtx, VSCore *core);
public:
  // Video info exposed publicly, stored by value.
  // NOTE(review): presumably the output clip's format - confirm in the constructor.
  VSVideoInfo vi;

  // Frame request entry point; the parameter list matches the GetFrameMode* helpers.
  const VSFrameRef *GetFrame(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core);
  TDecimate(VSNodeRef *_child, int _mode, int _cycleR, int _cycle, double _rate,
    double _dupThresh, double _vidThresh, double _sceneThresh, int _hybrid,
    int _vidDetect, int _conCycle, int _conCycleTP, const char* _ovr,
    const char* _output, const char* _input, const char* _tfmIn, const char* _mkvOut,
    int _nt, int _blockx, int _blocky, bool _debug, bool _display, int _vfrDec,
    bool _batch, bool _tcfv1, bool _se, bool _chroma, bool _exPP, int _maxndl,
    bool _m2PA, bool _predenoise, bool _noblend, bool _ssd, bool _usehints,
    VSNodeRef *_clip2, int _sdlim, int _opt, const char* _orgOut, const VSAPI *_vsapi, VSCore *core);
  ~TDecimate();

//  int __stdcall SetCacheHints(int cachehints, int frame_range) override {
//    return cachehints == CACHE_GET_MTMODE ? MT_SERIALIZED : 0;
//  }
};

#endif // TDECIMATE_H
07070100000009000081A4000000000000000000000001671240C90000E95C000000000000000000000000000000000000003400000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateASM.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include "TDecimateASM.h"
#include "TCommonASM.h"
#include "emmintrin.h"
#include "smmintrin.h" // SSE4
#include <assert.h>

// Scalar reference blend for 8-bit planes.
// Every output pixel is
//   (weight_i * src1 + (65536 - weight_i) * src2 + 32768) >> 16
// so weight_i is the 16-bit scaled share of srcp1 and the result is rounded.
// Pitches are byte strides between rows. The plain-copy weights 0 and 65536 are
// expected to be handled by the caller (asserted here).
static void blend_uint8_c(uint8_t* dstp, const uint8_t* srcp1,
  const uint8_t* srcp2, int width, int height, int dst_pitch,
  int src1_pitch, int src2_pitch, int weight_i)
{
  assert(!(weight_i == 0 || weight_i == 65536));

  const int w_first = weight_i;
  const int w_second = 65536 - weight_i;
  const int half = 32768; // rounding term for the >> 16 below

  uint8_t* out_row = dstp;
  const uint8_t* first_row = srcp1;
  const uint8_t* second_row = srcp2;
  for (int row = 0; row < height; ++row)
  {
    for (int col = 0; col < width; ++col)
    {
      const int mixed = w_first * first_row[col] + w_second * second_row[col] + half;
      out_row[col] = static_cast<uint8_t>(mixed >> 16);
    }
    out_row += dst_pitch;
    first_row += src1_pitch;
    second_row += src2_pitch;
  }
}

// Scalar reference blend for planes stored as 16-bit samples.
// weight_i is the 15-bit scaled share of srcp1 (32768 == 100%); every pixel is
//   src2 + (((src1 - src2) * weight_i + 16384) >> 15)
// limited to [0, (1 << bits_per_pixel) - 1]. `width` counts pixels, the pitches
// are byte strides. The plain-copy weights 0 and 32768 are expected to be handled
// by the caller (asserted here).
static void blend_uint16_c(uint8_t* dstp, const uint8_t* srcp1,
  const uint8_t* srcp2, int width, int height, int dst_pitch,
  int src1_pitch, int src2_pitch, int weight_i, int bits_per_pixel)
{
  assert(!(weight_i == 0 || weight_i == 32768));

  const int peak = (1 << bits_per_pixel) - 1;

  for (int row = 0; row < height; ++row)
  {
    const uint16_t* first = reinterpret_cast<const uint16_t*>(srcp1);
    const uint16_t* second = reinterpret_cast<const uint16_t*>(srcp2);
    uint16_t* out = reinterpret_cast<uint16_t*>(dstp);

    for (int col = 0; col < width; ++col)
    {
      const int a = first[col];
      const int b = second[col];
      int mixed = b + (((a - b) * weight_i + 16384) >> 15);
      // Limit to the valid sample range: upper bound first, then lower bound.
      if (mixed > peak)
        mixed = peak;
      if (mixed < 0)
        mixed = 0;
      out[col] = static_cast<uint16_t>(mixed);
    }

    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
    dstp += dst_pitch;
  }
}

// SSE2 weighted blend of two 8-bit planes.
// weight_i is the 16-bit scaled share of srcp1; the plain-copy weights 0 and
// 65536 are expected to be handled by the caller (asserted here).
// Each step loads and stores 16 bytes with aligned instructions, so the three row
// pointers must stay 16-byte aligned and rows are processed in whole 16-byte steps.
// The high-multiply does not round, so results can differ by one from the C version.
static void blend_uint8_SSE2(uint8_t* dstp, const uint8_t* srcp1,
  const uint8_t* srcp2, int width, int height, int dst_pitch,
  int src1_pitch, int src2_pitch, int weight_i)
{
  assert(!(weight_i == 0 || weight_i == 65536));

  const __m128i weight1 = _mm_set1_epi16(static_cast<short>(weight_i));
  const __m128i weight2 = _mm_set1_epi16(static_cast<short>(65536 - weight_i));

  // Blends eight pixels that were widened to 16 bits by duplicating each byte.
  // mulhi yields the upper 16 bits of the product directly; the final shift
  // brings the value back to 8-bit range.
  const auto blend8 = [weight1, weight2](__m128i wide1, __m128i wide2) {
    const __m128i part1 = _mm_mulhi_epu16(wide1, weight1);
    const __m128i part2 = _mm_mulhi_epu16(wide2, weight2);
    return _mm_srli_epi16(_mm_adds_epu16(part1, part2), 8);
  };

  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft) {
    for (int offset = 0; offset < width; offset += 16) {
      const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 + offset));
      const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 + offset));

      const __m128i low = blend8(_mm_unpacklo_epi8(a, a), _mm_unpacklo_epi8(b, b));
      const __m128i high = blend8(_mm_unpackhi_epi8(a, a), _mm_unpackhi_epi8(b, b));

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + offset), _mm_packus_epi16(low, high));
    }
    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
    dstp += dst_pitch;
  }
}


// SSE4.1 weighted blend of two planes stored as 16-bit samples.
//
// Template parameter:
//   lessThan16bits - true when bits_per_pixel < 16; the result is then limited
//                    to (1 << bits_per_pixel) - 1. For full 16-bit data the
//                    unsigned saturating pack already keeps values in range.
// Parameters:
//   width          - row width in pixels (the loop walks width * 2 bytes)
//   *_pitch        - byte strides between rows
//   weight_i       - 15-bit scaled share of srcp1 (32768 == 100%); the plain-copy
//                    weights 0 and 32768 must be handled by the caller
//
// Per pixel: dst = src2 + (((src1 - src2) * weight_i + 16384) >> 15).
// Loads and stores are aligned 16-byte accesses, so the row pointers must stay
// 16-byte aligned and rows are processed in whole 16-byte steps.
//
// The target attribute lets GCC/Clang compile the SSE4.1 intrinsics without
// -msse4.1 for the whole file. It is enabled by the build-system macros GCC/CLANG
// and, additionally, by the compiler's own predefined macros so that the function
// still compiles when those -D flags are not supplied.
template<bool lessThan16bits>
#if defined(GCC) || defined(CLANG) || defined(__GNUC__) || defined(__clang__)
__attribute__((__target__("sse4.1")))
#endif
static void blend_uint16_SSE4(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2,
  int width, int height,
  int dst_pitch, int src1_pitch, int src2_pitch, int weight_i, int bits_per_pixel)
{
  assert(weight_i != 0 && weight_i != 32768);
  // full copy cases have to be handled earlier
  // 15 bit integer arithmetic
  const auto round_mask = _mm_set1_epi32(0x4000); // 32768/2
  const auto weight = _mm_set1_epi32(weight_i);
  const auto zero = _mm_setzero_si128();

  const int max_pixel_value = (1 << bits_per_pixel) - 1;
  const auto max_pixel_value_128 = _mm_set1_epi16(static_cast<short>(max_pixel_value));

  const int row_bytes = width * static_cast<int>(sizeof(uint16_t));

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < row_bytes; x += 16) {
      const auto src1 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp1 + x));
      const auto src2 = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp2 + x));

      // widen the eight 16-bit samples to two groups of four 32-bit values
      const auto src1_lo = _mm_unpacklo_epi16(src1, zero);
      const auto src1_hi = _mm_unpackhi_epi16(src1, zero);

      const auto src2_lo = _mm_unpacklo_epi16(src2, zero);
      const auto src2_hi = _mm_unpackhi_epi16(src2, zero);

      // return src2 +(((src1 - src2) * weight_15bits + round) >> 15);

      const auto diff_lo = _mm_sub_epi32(src1_lo, src2_lo);
      const auto diff_hi = _mm_sub_epi32(src1_hi, src2_hi);

      auto lerp_lo = _mm_mullo_epi32(diff_lo, weight);
      auto lerp_hi = _mm_mullo_epi32(diff_hi, weight);

      // arithmetic shift: the difference may be negative
      lerp_lo = _mm_srai_epi32(_mm_add_epi32(lerp_lo, round_mask), 15);
      lerp_hi = _mm_srai_epi32(_mm_add_epi32(lerp_hi, round_mask), 15);

      const auto result_lo = _mm_add_epi32(src2_lo, lerp_lo);
      const auto result_hi = _mm_add_epi32(src2_hi, lerp_hi);

      auto result = _mm_packus_epi32(result_lo, result_hi);
      if constexpr(lessThan16bits) // otherwise no clamp needed
        result = _mm_min_epu16(result, max_pixel_value_128);

      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + x), result);
    }

    dstp += dst_pitch;
    srcp1 += src1_pitch;
    srcp2 += src2_pitch;
  }
}

// Chooses the blend kernel for the given bit depth and CPU capabilities.
// Handles the 50% special case as well; works for 8 and 10-16 bit data.
// weight_i is 15-bit scaled (32768 == 100% of srcp1); weights 0 and 32768
// (plain copies) must already have been handled by the caller.
void dispatch_blend(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height,
  int dst_pitch, int src1_pitch, int src2_pitch, int weight_i, int bits_per_pixel, const CPUFeatures *cpuFlags)
{
  const bool haveSSE2 = cpuFlags->sse2;
  const bool haveSSE41 = cpuFlags->sse4_1;
  const bool eightBit = (bits_per_pixel == 8);

  // Exact 50/50 mix: dedicated averaging kernels.
  if (weight_i == 32768 / 2) {
    if (eightBit && haveSSE2)
      blend_5050_SSE2<uint8_t>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch);
    else if (eightBit)
      blend_5050_c<uint8_t>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch);
    else if (haveSSE2)
      blend_5050_SSE2<uint16_t>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch);
    else
      blend_5050_c<uint16_t>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch);
    return;
  }

  // Arbitrary weight, 8-bit data: these kernels expect a 16-bit scaled weight,
  // hence the doubling of the 15-bit value.
  if (eightBit) {
    const int weight16 = weight_i * 2;
    if (haveSSE2)
      blend_uint8_SSE2(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch, weight16);
    else
      blend_uint8_c(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch, weight16);
    return;
  }

  // Arbitrary weight, 10-16 bit data.
  if (!haveSSE41) {
    blend_uint16_c(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch, weight_i, bits_per_pixel);
    return;
  }

  if (bits_per_pixel < 16)
    blend_uint16_SSE4<true>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch, weight_i, bits_per_pixel);
  else
    blend_uint16_SSE4<false>(dstp, srcp1, srcp2, width, height, dst_pitch, src1_pitch, src2_pitch, weight_i, bits_per_pixel);
}


// Sum of absolute differences over the even-indexed bytes (the luma samples of
// packed YUY2) of two planes; the total is written to `sad`.
// `width` is in bytes and is walked in 16-byte steps with aligned loads, so both
// row pointers must stay 16-byte aligned. Pitches are byte strides.
void calcLumaDiffYUY2SAD_SSE2_16(const uint8_t *prvp, const uint8_t *nxtp,
  int width, int height, int prv_pitch, int nxt_pitch, uint64_t &sad)
{
  const __m128i keepLuma = _mm_set1_epi16(0x00FF); // zeroes every odd byte
  __m128i total = _mm_setzero_si128();
  sad = 0;

  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft) {
    for (int offset = 0; offset < width; offset += 16)
    {
      const __m128i a = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + offset)), keepLuma);
      const __m128i b = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(nxtp + offset)), keepLuma);
      total = _mm_add_epi64(total, _mm_sad_epu8(a, b));
    }
    prvp += prv_pitch;
    nxtp += nxt_pitch;
  }

  // fold the upper 64-bit partial sum into the lower one
  const __m128i folded = _mm_add_epi64(total, _mm_srli_si128(total, 8));
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&sad), folded);
}


// Sum of squared differences over the even-indexed bytes (the luma samples of
// packed YUY2) of two planes; the total is written to `ssd`.
// `width` is in bytes and is walked in 16-byte steps with aligned loads, so both
// row pointers must stay 16-byte aligned. Pitches are byte strides.
// Squares are collected in 32-bit lanes per row and widened to 64 bits after
// every row.
void calcLumaDiffYUY2SSD_SSE2_16(const uint8_t *prvp, const uint8_t *nxtp,
  int width, int height, int prv_pitch, int nxt_pitch, uint64_t &ssd)
{
  const __m128i keepLuma = _mm_set1_epi16(0x00FF);
  const __m128i zero = _mm_setzero_si128();
  __m128i total = _mm_setzero_si128(); // two 64-bit partial sums
  ssd = 0;

  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft) {
    __m128i rowAcc = _mm_setzero_si128(); // four 32-bit partial sums of this row

    for (int offset = 0; offset < width; offset += 16)
    {
      const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + offset));
      const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i *>(nxtp + offset));
      // |a - b| per byte: one of the two saturating differences is always zero
      __m128i absDiff = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
      absDiff = _mm_and_si128(absDiff, keepLuma);
      rowAcc = _mm_add_epi32(rowAcc, _mm_madd_epi16(absDiff, absDiff));
    }

    // widen the row sums to 64 bits and add them to the running total
    const __m128i wideLow = _mm_unpacklo_epi32(rowAcc, zero);
    const __m128i wideHigh = _mm_unpackhi_epi32(rowAcc, zero);
    total = _mm_add_epi64(total, _mm_add_epi64(wideLow, wideHigh));

    prvp += prv_pitch;
    nxtp += nxt_pitch;
  }

  // fold the upper 64-bit partial sum into the lower one
  total = _mm_add_epi64(total, _mm_srli_si128(total, 8));
  _mm_storel_epi64(reinterpret_cast<__m128i*>(&ssd), total);
}

// Sum of absolute differences of a block 16 bytes wide and blkSizeY rows high.
// blkSizeY must be a multiple of 8 (rows are handled in groups of eight).
// Uses aligned 16-byte loads, so every row start must be 16-byte aligned.
// The result is written to `sad`.
template<int blkSizeY>
void calcSAD_SSE2_16xN(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int& sad)
{
  assert(0 == blkSizeY % 8);

  const int rowCount = (blkSizeY / 8) * 8;
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < rowCount; ++row) {
    const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(ptr1));
    const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(ptr2));
    acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // the SAD instruction leaves one partial sum per 64-bit half: add lo, hi
  const __m128i folded = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  sad = _mm_cvtsi128_si32(folded);
}

// Scalar sum of absolute differences of a block 2 bytes wide and blkSizeY rows
// high; the result is written to `sad`. Only 411 uses this.
template<int blkSizeY>
void calcSAD_C_2xN(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int& sad)
{
  const auto magnitude = [](int v) { return v < 0 ? -v : v; };

  int total = 0;
  const uint8_t* rowA = ptr1;
  const uint8_t* rowB = ptr2;
  for (int row = 0; row < blkSizeY; ++row) {
    for (int col = 0; col < 2; ++col)
      total += magnitude(rowA[col] - rowB[col]);
    rowA += pitch1;
    rowB += pitch2;
  }

  sad = total;
}

// Scalar sum of squared differences of a block 2 bytes wide and blkSizeY rows
// high. The result is written to the last parameter (named `sad` although it
// receives a sum of squares).
template<int blkSizeY>
void calcSSD_C_2xN(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int& sad)
{
  int total = 0;
  const uint8_t* rowA = ptr1;
  const uint8_t* rowB = ptr2;
  for (int row = 0; row < blkSizeY; ++row) {
    const int d0 = rowA[0] - rowB[0];
    const int d1 = rowA[1] - rowB[1];
    total += d0 * d0 + d1 * d1;
    rowA += pitch1;
    rowB += pitch2;
  }

  sad = total;
}


// Sum of absolute differences of a block 4 bytes wide and blkSizeY rows high.
// blkSizeY must be a multiple of 4 (rows are handled in groups of four).
// Each row is fetched with a 4-byte scalar load, so no alignment is required.
// The result is written to `sad`.
template<int blkSizeY>
void calcSAD_SSE2_4xN(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &sad)
{
  assert(0 == blkSizeY % 4);

  // 32-bit load into the lowest lane, remaining lanes zeroed
  const auto load4 = [](const uint8_t* p) {
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(p)));
  };

  const int rowCount = (blkSizeY / 4) * 4;
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < rowCount; ++row) {
    acc = _mm_add_epi32(acc, _mm_sad_epu8(load4(ptr1), load4(ptr2)));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  sad = _mm_cvtsi128_si32(acc); // only the low half carries data
}

// Sum of absolute differences of a block 8 bytes wide and blkSizeY rows high.
// blkSizeY should be a multiple of 8 (rows are handled in groups of eight).
// Each row is fetched with an 8-byte load into the low half of the register.
// The result is written to `sad`.
template<int blkSizeY>
void calcSAD_SSE2_8xN(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &sad)
{
  assert(0 == blkSizeY % 8);

  const int rowCount = (blkSizeY / 8) * 8;
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < rowCount; ++row) {
    const __m128i a = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr1));
    const __m128i b = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr2));
    acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  sad = _mm_cvtsi128_si32(acc); // only the low half carries data
}

// Sum of absolute differences over eight rows of 8 bytes each, counting only the
// even-indexed bytes (the luma samples of packed YUY2).
// Each row is fetched with an 8-byte load; the result is written to `sad`.
void calcSAD_SSE2_8x8_YUY2_lumaonly(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &sad)
{
  const __m128i keepLuma = _mm_set1_epi16(0x00FF); // zeroes every odd byte
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < 8; ++row) {
    const __m128i a = _mm_and_si128(
      _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr1)), keepLuma);
    const __m128i b = _mm_and_si128(
      _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr2)), keepLuma);
    acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  sad = _mm_cvtsi128_si32(acc); // only the low half carries data
}

// Sum of absolute differences over 16 rows of 32 bytes each
// (really YUY2 16x16 with chroma). Uses aligned 16-byte loads, so every row
// start must be 16-byte aligned. The result is written to `sad`.
void calcSAD_SSE2_32x16(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int& sad)
{
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < 16; ++row) {
    // a row consists of two 16-byte halves
    for (int half = 0; half < 32; half += 16) {
      const __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(ptr1 + half));
      const __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(ptr2 + half));
      acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));
    }
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // the SAD instruction leaves one partial sum per 64-bit half: add lo, hi
  const __m128i folded = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  sad = _mm_cvtsi128_si32(folded);
}

// Sum of absolute differences over 16 rows of 32 bytes each, counting only the
// even-indexed bytes (really YUY2 16x16 no chroma). Uses aligned 16-byte loads,
// so every row start must be 16-byte aligned. The result is written to `sad`.
void calcSAD_SSE2_32x16_YUY2_lumaonly(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &sad)
{
  const __m128i keepLuma = _mm_set1_epi16(0x00FF); // zeroes every odd byte
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < 16; ++row) {
    // a row consists of two 16-byte halves
    for (int half = 0; half < 32; half += 16) {
      const __m128i a = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(ptr1 + half)), keepLuma);
      const __m128i b = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(ptr2 + half)), keepLuma);
      acc = _mm_add_epi32(acc, _mm_sad_epu8(a, b));
    }
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // the SAD instruction leaves one partial sum per 64-bit half: add lo, hi
  const __m128i folded = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  sad = _mm_cvtsi128_si32(folded);
}


// Sum of squared differences of a block 4 bytes wide and blkSizeY rows high.
// blkSizeY must be even (rows are handled in pairs).
// Each row is fetched with a 4-byte scalar load, so no alignment is required.
// The result is written to `ssd`.
template<int blkSizeY>
void calcSSD_SSE2_4xN(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  assert(0 == blkSizeY % 2);

  const __m128i zero = _mm_setzero_si128();

  // 32-bit load into the lowest lane, remaining lanes zeroed
  const auto load4 = [](const uint8_t* p) {
    return _mm_castps_si128(_mm_load_ss(reinterpret_cast<const float*>(p)));
  };

  const int rowCount = (blkSizeY / 2) * 2;
  __m128i acc = _mm_setzero_si128();

  for (int row = 0; row < rowCount; ++row) {
    const __m128i a = load4(ptr1);
    const __m128i b = load4(ptr2);
    // |a - b| per byte; only the low 4 bytes are valid
    const __m128i absDiff = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
    // widen to 16 bits, then square and add neighbouring pairs into 32-bit lanes
    const __m128i wide = _mm_unpacklo_epi8(absDiff, zero);
    acc = _mm_add_epi32(acc, _mm_madd_epi16(wide, wide));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // only the two lowest 32-bit lanes carry data: add lane 1 onto lane 0
  const __m128i folded = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  ssd = _mm_cvtsi128_si32(folded);
}

// Sum of squared differences (SSD) over a block 8 bytes wide and blkSizeY rows tall.
//   ptr1, ptr2     : top-left byte of the two blocks (8 bytes are read per row, unaligned is fine)
//   pitch1, pitch2 : row strides in bytes
//   ssd            : receives the result
// blkSizeY must be even because two rows are processed per iteration; this is
// checked at compile time so the check is not lost in NDEBUG (release) builds.
template<int blkSizeY>
void calcSSD_SSE2_8xN(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  static_assert(0 == blkSizeY % 2, "blkSizeY must be even: two rows are processed per iteration");

  // even blkSize Y 8x8, 8x16
  __m128i tmpsum = _mm_setzero_si128();
  __m128i zero = _mm_setzero_si128();
  // two lines at a time -> 8 = 4x2
  for (int i = 0; i < blkSizeY / 2; i++) {
    __m128i xmm0, xmm1, xmm2, xmm3;
    __m128i tmp0, tmp1, tmp0lo, tmp1lo;
    // two lines, only lower 8 bytes
    xmm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr1));
    xmm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr1 + pitch1));
    xmm2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr2));
    xmm3 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr2 + pitch2));

    // abs diff per byte: one of the two saturating subtractions is always zero
    tmp0 = _mm_or_si128(_mm_subs_epu8(xmm0, xmm2), _mm_subs_epu8(xmm2, xmm0));
    tmp1 = _mm_or_si128(_mm_subs_epu8(xmm1, xmm3), _mm_subs_epu8(xmm3, xmm1));

    // widen to 16 bit, then madd squares and pairwise-adds into four 32-bit lanes
    tmp0lo = _mm_unpacklo_epi8(tmp0, zero);
    tmp0lo = _mm_madd_epi16(tmp0lo, tmp0lo);
    tmpsum = _mm_add_epi32(tmpsum, tmp0lo);

    tmp1lo = _mm_unpacklo_epi8(tmp1, zero);
    tmp1lo = _mm_madd_epi16(tmp1lo, tmp1lo);
    tmpsum = _mm_add_epi32(tmpsum, tmp1lo);

    ptr1 += pitch1 * 2;
    ptr2 += pitch2 * 2;
  }
  // horizontal reduction of the four 32-bit partial sums
  __m128i sum64lo = _mm_unpacklo_epi32(tmpsum, zero); // move to 64 bit boundary
  __m128i sum64hi = _mm_unpackhi_epi32(tmpsum, zero);
  tmpsum = _mm_add_epi64(sum64lo, sum64hi);

  __m128i sum = _mm_add_epi32(tmpsum, _mm_srli_si128(tmpsum, 8)); // add lo, hi
  ssd = _mm_cvtsi128_si32(sum);
}

// Luma-only SSD for packed YUY2 data: 8 bytes wide (the even bytes are the
// luma samples), 8 rows tall. Unaligned 8-byte loads are used.
// The result is returned through 'ssd'.
void calcSSD_SSE2_8x8_YUY2_lumaonly(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  // keeps the even (luma) byte of every 16-bit pair
  const __m128i lumaOnly = _mm_set1_epi16(0x00FF);

  // squared luma differences of one 8-byte row, as two 32-bit partial sums
  // in the low 64 bits (the upper 64 bits stay zero)
  const auto rowSsd = [lumaOnly](const uint8_t *a, const uint8_t *b) {
    const __m128i va = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(a));
    const __m128i vb = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(b));
    // |a - b| per byte: one of the two saturating subtractions is always zero
    const __m128i absdiff = _mm_or_si128(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
    // after masking every 16-bit lane already holds 00XX, no unpack needed
    const __m128i luma = _mm_and_si128(absdiff, lumaOnly);
    return _mm_madd_epi16(luma, luma);
  };

  __m128i acc = _mm_setzero_si128();
  for (int row = 0; row < 8; ++row) {
    acc = _mm_add_epi32(acc, rowSsd(ptr1, ptr2));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // only the two low 32-bit lanes carry data: add lane 1 onto lane 0
  const __m128i total = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  ssd = _mm_cvtsi128_si32(total);
}


// Sum of squared differences (SSD) over a block 16 bytes wide and blkSizeY rows tall.
//   ptr1, ptr2     : top-left byte of the two blocks; aligned 16-byte loads are used,
//                    so every row start must be 16-byte aligned
//   pitch1, pitch2 : row strides in bytes
//   ssd            : receives the result
// blkSizeY must be even because two rows are processed per iteration; this is
// checked at compile time so the check is not lost in NDEBUG (release) builds.
template<int blkSizeY>
void calcSSD_SSE2_16xN(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  static_assert(0 == blkSizeY % 2, "blkSizeY must be even: two rows are processed per iteration");

  __m128i tmpsum = _mm_setzero_si128();
  __m128i zero = _mm_setzero_si128();
  // two lines at a time -> 16 = 8x2
  for (int i = 0; i < blkSizeY / 2; i++) {
    __m128i xmm0, xmm1, xmm2, xmm3;
    __m128i tmp0, tmp1, tmp0lo, tmp0hi, tmp1lo, tmp1hi;
    // two lines
    xmm0 = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr1));
    xmm1 = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr1 + pitch1));
    xmm2 = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr2));
    xmm3 = _mm_load_si128(reinterpret_cast<const __m128i *>(ptr2 + pitch2));

    // abs diff per byte: one of the two saturating subtractions is always zero
    tmp0 = _mm_or_si128(_mm_subs_epu8(xmm0, xmm2), _mm_subs_epu8(xmm2, xmm0));
    tmp1 = _mm_or_si128(_mm_subs_epu8(xmm1, xmm3), _mm_subs_epu8(xmm3, xmm1));

    // widen both halves to 16 bit, then madd squares and pairwise-adds into 32-bit lanes
    tmp0lo = _mm_unpacklo_epi8(tmp0, zero);
    tmp0hi = _mm_unpackhi_epi8(tmp0, zero);
    tmp0lo = _mm_madd_epi16(tmp0lo, tmp0lo);
    tmp0hi = _mm_madd_epi16(tmp0hi, tmp0hi);
    tmpsum = _mm_add_epi32(tmpsum, tmp0lo);
    tmpsum = _mm_add_epi32(tmpsum, tmp0hi);

    tmp1lo = _mm_unpacklo_epi8(tmp1, zero);
    tmp1hi = _mm_unpackhi_epi8(tmp1, zero);
    tmp1lo = _mm_madd_epi16(tmp1lo, tmp1lo);
    tmp1hi = _mm_madd_epi16(tmp1hi, tmp1hi);
    tmpsum = _mm_add_epi32(tmpsum, tmp1lo);
    tmpsum = _mm_add_epi32(tmpsum, tmp1hi);

    ptr1 += pitch1 * 2;
    ptr2 += pitch2 * 2;
  }
  // horizontal reduction of the four 32-bit partial sums
  __m128i sum64lo = _mm_unpacklo_epi32(tmpsum, zero); // move to 64 bit boundary
  __m128i sum64hi = _mm_unpackhi_epi32(tmpsum, zero);
  tmpsum = _mm_add_epi64(sum64lo, sum64hi);

  __m128i sum = _mm_add_epi32(tmpsum, _mm_srli_si128(tmpsum, 8)); // add lo, hi
  ssd = _mm_cvtsi128_si32(sum);
}

// Explicit instantiations of the SSD kernels, grouped by block width.
// 4 bytes wide
template void calcSSD_SSE2_4xN<4>(const uint8_t*, const uint8_t*, int, int, int&);
template void calcSSD_SSE2_4xN<8>(const uint8_t*, const uint8_t*, int, int, int&);
template void calcSSD_SSE2_4xN<16>(const uint8_t*, const uint8_t*, int, int, int&);
// 8 bytes wide
template void calcSSD_SSE2_8xN<8>(const uint8_t*, const uint8_t*, int, int, int&);
template void calcSSD_SSE2_8xN<16>(const uint8_t*, const uint8_t*, int, int, int&);
// 16 bytes wide
template void calcSSD_SSE2_16xN<16>(const uint8_t*, const uint8_t*, int, int, int&);

// SSD over a block 32 bytes wide and 16 rows tall. For packed YUY2 that is a
// 16x16 pixel block with luma and chroma both included.
// Aligned 16-byte loads are used, so every row start of both sources must be
// 16-byte aligned. The result is returned through 'ssd'.
void calcSSD_SSE2_32x16(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  const __m128i zero = _mm_setzero_si128();

  // squared differences of one aligned 16-byte chunk, folded into four 32-bit partial sums
  const auto chunkSsd = [zero](const uint8_t *a, const uint8_t *b) {
    const __m128i va = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
    const __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i *>(b));
    // |a - b| per byte: one of the two saturating subtractions is always zero
    const __m128i absdiff = _mm_or_si128(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
    // widen to 16 bit; madd squares each lane and adds neighbouring pairs
    const __m128i lo = _mm_unpacklo_epi8(absdiff, zero);
    const __m128i hi = _mm_unpackhi_epi8(absdiff, zero);
    return _mm_add_epi32(_mm_madd_epi16(lo, lo), _mm_madd_epi16(hi, hi));
  };

  __m128i acc = _mm_setzero_si128();
  for (int row = 0; row < 16; ++row) {
    acc = _mm_add_epi32(acc, chunkSsd(ptr1, ptr2));
    acc = _mm_add_epi32(acc, chunkSsd(ptr1 + 16, ptr2 + 16));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // horizontal add of the four 32-bit lanes into lane 0
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  ssd = _mm_cvtsi128_si32(acc);
}

// Luma-only SSD for packed YUY2 data: 32 bytes (= 16 pixels) wide, 16 rows tall.
// Only the even (luma) bytes contribute; chroma bytes are masked out.
// Aligned 16-byte loads are used, so every row start of both sources must be
// 16-byte aligned. The result is returned through 'ssd'.
void calcSSD_SSE2_32x16_YUY2_lumaonly(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int &ssd)
{
  // keeps the even (luma) byte of every 16-bit pair
  const __m128i lumaOnly = _mm_set1_epi16(0x00FF);

  // squared luma differences of one aligned 16-byte chunk as four 32-bit partial sums
  const auto chunkSsd = [lumaOnly](const uint8_t *a, const uint8_t *b) {
    const __m128i va = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
    const __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i *>(b));
    // |a - b| per byte: one of the two saturating subtractions is always zero
    const __m128i absdiff = _mm_or_si128(_mm_subs_epu8(va, vb), _mm_subs_epu8(vb, va));
    // after masking every 16-bit lane already holds 00XX, no unpack needed
    const __m128i luma = _mm_and_si128(absdiff, lumaOnly);
    return _mm_madd_epi16(luma, luma);
  };

  __m128i acc = _mm_setzero_si128();
  for (int row = 0; row < 16; ++row) {
    acc = _mm_add_epi32(acc, chunkSsd(ptr1, ptr2));
    acc = _mm_add_epi32(acc, chunkSsd(ptr1 + 16, ptr2 + 16));
    ptr1 += pitch1;
    ptr2 += pitch2;
  }

  // horizontal add of the four 32-bit lanes into lane 0
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  ssd = _mm_cvtsi128_si32(acc);
}

// Horizontal [1 2 1] / 4 blur of an 8-bit plane: dst[x] = (2*src[x] + src[x-1] + src[x+1] + 2) >> 2.
// The row is processed in 8-byte steps with unaligned loads/stores, so 8 bytes are
// written per step even when 'width' is not a multiple of 8.
// Bytes at srcp[x-1] and srcp[x+8] are read, so this cannot be called for the
// leftmost/rightmost blocks of a row.
//   srcp/dstp           : first byte to process in the first row
//   src_pitch/dst_pitch : row strides in bytes
//   width/height        : size of the processed area in bytes / rows
void HorizontalBlurSSE2_Planar_R(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
  int dst_pitch, int width, int height)
{
  const __m128i two = _mm_set1_epi16(0x0002); // rounder
  const __m128i zero = _mm_setzero_si128();
  while (height--) {
    for (int x = 0; x < width; x += 8) {
      // we have -1/+1 here, cannot be called for leftmost/rightmost blocks
      // 8 bytes are loaded and widened to eight 16-bit lanes; the 64-bit loads leave
      // nothing in the upper half, so only the unpacklo result has to be processed
      const __m128i left = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x - 1)), zero);
      const __m128i center = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x)), zero);
      const __m128i right = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x + 1)), zero);

      // (center*2 + left + right + 2) >> 2
      __m128i res = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(center, 1), left), right);
      res = _mm_srli_epi16(_mm_add_epi16(res, two), 2); // +2, / 4

      // back to bytes; only the low 8 bytes are stored
      _mm_storel_epi64(reinterpret_cast<__m128i *>(dstp + x), _mm_packus_epi16(res, res));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


// Horizontal [1 2 1] / 4 blur for packed YUY2 using neighbours 2 bytes away:
// dst[x] = (2*src[x] + src[x-2] + src[x+2] + 2) >> 2 for every byte.
// For the luma bytes (even positions) these are the previous/next luma samples.
// The odd (chroma) bytes are filtered with the same +/-2 taps and written too;
// presumably callers only consume the luma result -- confirm against the call sites.
// The row is processed in 8-byte steps with unaligned loads/stores. Bytes at
// srcp[x-2] and up to srcp[x+9] are read, so this cannot be called for the
// leftmost/rightmost blocks of a row.
void HorizontalBlurSSE2_YUY2_R_luma(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
  int dst_pitch, int width, int height)
{
  const __m128i two = _mm_set1_epi16(0x0002); // rounder
  const __m128i zero = _mm_setzero_si128();
  while (height--) {
    for (int x = 0; x < width; x += 8) {
      // we have -2/+2 here, cannot be called for leftmost/rightmost blocks
      // same as planar case but +/-2 instead of +/-1
      // 8 bytes are loaded and widened to eight 16-bit lanes; the 64-bit loads leave
      // nothing in the upper half, so only the unpacklo result has to be processed
      const __m128i left = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x - 2)), zero);
      const __m128i center = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x)), zero);
      const __m128i right = _mm_unpacklo_epi8(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcp + x + 2)), zero);

      // (center*2 + left + right + 2) >> 2
      __m128i res = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(center, 1), left), right);
      res = _mm_srli_epi16(_mm_add_epi16(res, two), 2); // +2, / 4

      // back to bytes; only the low 8 bytes are stored
      _mm_storel_epi64(reinterpret_cast<__m128i *>(dstp + x), _mm_packus_epi16(res, res));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


// Horizontal [1 2 1] / 4 blur for packed YUY2, luma and chroma.
//   luma   (even bytes): neighbours are 2 bytes away
//   chroma (odd bytes) : neighbours are 4 bytes away
// Both variants are computed for all 8 bytes of a step and then merged with
// byte masks, so each output byte comes from the filter matching its position.
// The row is processed in 8-byte steps (mod 8 always) with unaligned loads/stores.
// Bytes from srcp[x-4] up to srcp[x+11] are read, so this cannot be called for
// the leftmost/rightmost blocks of a row.
void HorizontalBlurSSE2_YUY2_R(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
  int dst_pitch, int width, int height)
{
  const __m128i two = _mm_set1_epi16(2); // rounder
  const __m128i zero = _mm_setzero_si128();
  const __m128i luma_mask = _mm_set1_epi16(0x00FF);                        // even bytes
  const __m128i chroma_mask = _mm_set1_epi16(static_cast<short>(0xFF00)); // odd bytes

  // loads 8 bytes (unaligned) and widens them to eight 16-bit lanes
  const auto load8 = [zero](const uint8_t *p) {
    return _mm_unpacklo_epi8(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p)), zero);
  };
  // (center*2 + left + right + 2) >> 2 per 16-bit lane
  const auto blur = [two](__m128i l, __m128i c, __m128i r) {
    const __m128i sum = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(c, 1), l), r);
    return _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
  };

  while (height--) {
    for (int x = 0; x < width; x += 8) {
      const __m128i center = load8(srcp + x);

      // luma part: -2/+2 taps
      const __m128i luma16 = blur(load8(srcp + x - 2), center, load8(srcp + x + 2));
      // chroma part: -4/+4 taps (same U or V component of the neighbouring pixel pair)
      const __m128i chroma16 = blur(load8(srcp + x - 4), center, load8(srcp + x + 4));

      // back to bytes (low 8 bytes are the valid ones)
      const __m128i res1 = _mm_and_si128(_mm_packus_epi16(luma16, luma16), luma_mask);
      // the chroma mask is applied to the chroma result; masking the already
      // luma-masked value would always give zero and wipe the chroma bytes
      const __m128i res2 = _mm_and_si128(_mm_packus_epi16(chroma16, chroma16), chroma_mask);
      const __m128i res = _mm_or_si128(res1, res2);

      _mm_storel_epi64(reinterpret_cast<__m128i *>(dstp + x), res);
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


// Vertical [1 2 1] / 4 blur: dst = (2*cur + rowAbove + rowBelow + 2) >> 2.
// Works on 16 bytes per step with aligned loads and stores, so srcp, dstp and
// both pitches must keep every accessed address 16-byte aligned.
// The rows directly above and below each processed row are read, so this
// cannot be called for the first/last row of the buffer.
void VerticalBlurSSE2_R(const uint8_t *srcp, uint8_t *dstp,
  int src_pitch, int dst_pitch, int width, int height)
{
  const __m128i rounder = _mm_set1_epi16(0x0002);
  const __m128i zero = _mm_setzero_si128();

  // (mid*2 + up + down + 2) >> 2 on eight 16-bit lanes
  const auto blur8 = [rounder](__m128i up, __m128i mid, __m128i down) {
    const __m128i weighted = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(mid, 1), up), down);
    return _mm_srli_epi16(_mm_add_epi16(weighted, rounder), 2);
  };

  for (; height != 0; --height) {
    for (int x = 0; x < width; x += 16) {
      const uint8_t *cur = srcp + x;
      const __m128i rowAbove = _mm_load_si128(reinterpret_cast<const __m128i*>(cur - src_pitch));
      const __m128i rowCur = _mm_load_si128(reinterpret_cast<const __m128i*>(cur));
      const __m128i rowBelow = _mm_load_si128(reinterpret_cast<const __m128i*>(cur + src_pitch));

      // low and high 8 bytes are widened to 16 bit and filtered separately
      const __m128i lowHalf = blur8(_mm_unpacklo_epi8(rowAbove, zero),
                                    _mm_unpacklo_epi8(rowCur, zero),
                                    _mm_unpacklo_epi8(rowBelow, zero));
      const __m128i highHalf = blur8(_mm_unpackhi_epi8(rowAbove, zero),
                                     _mm_unpackhi_epi8(rowCur, zero),
                                     _mm_unpackhi_epi8(rowBelow, zero));

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), _mm_packus_epi16(lowHalf, highHalf));
    }
    srcp += src_pitch;
    dstp += dst_pitch;
  }
}


//-------- helpers

// Accumulates block differences between two 8-bit planes into 'diff'.
// SAD == true: sum of absolute differences, SAD == false: sum of squared differences.
//
// The plane is walked in base blocks of 16x16 pixels for luma; for chroma planes the
// base block is reduced by the format's subsampling (8x16, 8x8 or 4x16). Whole base
// blocks go through the SIMD kernels; columns right of the last whole block and rows
// below the last whole block row are handled by scalar code.
// Every partial result is added to four entries of 'diff':
//   diff[temp1 + box1 + 0], diff[temp1 + box2 + 1], diff[temp2 + box1 + 2], diff[temp2 + box2 + 3]
// where temp1/temp2 select the box row (in units of xblocks4) and box1/box2 the
// 4-entry box within that row.
//
//   ptr1, ptr2     : plane pointers
//   pitch1, pitch2 : row strides in bytes
//   width, height  : plane size in pixels
//   plane          : 0 = luma (no subsampling), otherwise subsampling is taken from vi
//   xblocks4       : number of diff entries per box row
//   chroma         : unused
template<bool SAD>
static void calcDiff_SADorSSD_32x32_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, const VSVideoInfo *vi)
{
  (void)chroma;

  int temp1, temp2, y, x, u, difft, box1, box2;
  int widtha, heighta, heights = height, widths = width;
  const uint8_t* ptr1T, * ptr2T;

  // from YV12 to generic planar
  const int xsubsampling = plane == 0 ? 0 : vi->format->subSamplingW;
  const int ysubsampling = plane == 0 ? 0 : vi->format->subSamplingH;
  // base: luma: 16x16, chroma: divided with subsampling
  const int w_to_shift = 4 - xsubsampling;
  const int h_to_shift = 4 - ysubsampling;
  // whole blocks
  heighta = (height >> h_to_shift) << h_to_shift; // mod16 for luma, mod8 or 16 for chroma
  widtha = (width >> w_to_shift) << w_to_shift; // mod16 for luma, mod8 or 16 for chroma
  height >>= h_to_shift; // from here on: number of whole block rows
  width >>= w_to_shift;  // from here on: number of whole block columns

  // all kernels share the prototype of calcSAD_SSE2_16xN
  using SAD_fn_t = decltype(calcSAD_SSE2_16xN<16>);
  SAD_fn_t* SAD_fn = nullptr;
  if constexpr (SAD) {
    if (xsubsampling == 0 && ysubsampling == 0) // YV24 or luma
      SAD_fn = calcSAD_SSE2_16xN<16>;
    else if (xsubsampling == 1 && ysubsampling == 0) // YV16
      SAD_fn = calcSAD_SSE2_8xN<16>;
    else if (xsubsampling == 1 && ysubsampling == 1) // YV12
      SAD_fn = calcSAD_SSE2_8xN<8>;
    else if (xsubsampling == 2 && ysubsampling == 0) // YV411
      SAD_fn = calcSAD_SSE2_4xN<16>;
  }
  else {
    if (xsubsampling == 0 && ysubsampling == 0) // YV24 or luma
      SAD_fn = calcSSD_SSE2_16xN<16>;
    else if (xsubsampling == 1 && ysubsampling == 0) // YV16
      SAD_fn = calcSSD_SSE2_8xN<16>;
    else if (xsubsampling == 1 && ysubsampling == 1) // YV12
      SAD_fn = calcSSD_SSE2_8xN<8>;
    else if (xsubsampling == 2 && ysubsampling == 0) // YV411
      SAD_fn = calcSSD_SSE2_4xN<16>;
  }
  // other formats are forbidden and were pre-checked (SAD_fn stays nullptr for them)

  // number of whole blocks
  for (y = 0; y < height; ++y)
  {
    // at other places:
    // for (y = 0; y < heighta; y += yhalf)
    //   const int temp1 = (y >> blocky_shift)*xblocks4;
    //   const int temp2 = ((y + blocky_half) >> blocky_shift) * xblocks4;
    // FIXME: why >>1 and +1>>1 here?
    // Fact 1: y here goes in block-counter mode
    // Fact 2: Because we do 32x32 but with 16x16 luma (and divided chroma) blocks?
    temp1 = (y >> 1) * xblocks4;
    temp2 = ((y + 1) >> 1) * xblocks4;
    for (x = 0; x < width; ++x) // width is the number of blocks
    {
      SAD_fn(ptr1 + (x << w_to_shift), ptr2 + (x << w_to_shift), pitch1, pitch2, difft);
      box1 = (x >> 1) << 2;
      box2 = ((x + 1) >> 1) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    // rest non-simd: one pixel column at a time over the full height of this block row
    for (x = widtha; x < widths; ++x)
    {
      ptr1T = ptr1;
      ptr2T = ptr2;
      // u counts rows, so the bound is the vertical block size (1 << h_to_shift),
      // which differs from the horizontal one for 4:2:2 / 4:1:1 chroma
      for (difft = 0, u = 0; u < (1 << h_to_shift); ++u)
      {
        if constexpr (SAD)
          difft += abs(ptr1T[x] - ptr2T[x]);
        else
          difft += (ptr1T[x] - ptr2T[x]) * (ptr1T[x] - ptr2T[x]);
        ptr1T += pitch1;
        ptr2T += pitch2;
      }
      box1 = (x >> (w_to_shift + 1)) << 2;
      box2 = ((x + (1 << w_to_shift)) >> (w_to_shift + 1)) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    // += pitch1 * vertical blocksize
    ptr1 += pitch1 << h_to_shift;
    ptr2 += pitch2 << h_to_shift;
  }
  // remaining rows below the last whole block row: pixel by pixel, y and x in pixel units
  for (y = heighta; y < heights; ++y)
  {
    temp1 = (y >> (h_to_shift + 1)) * xblocks4; // y >> 5 or 4
    temp2 = ((y + (1 << h_to_shift)) >> (h_to_shift + 1)) * xblocks4;
    for (x = 0; x < widths; ++x)
    {
      if constexpr (SAD)
        difft = abs(ptr1[x] - ptr2[x]);
      else {
        difft = ptr1[x] - ptr2[x];
        difft *= difft;
      }
      box1 = (x >> (w_to_shift + 1)) << 2;
      box2 = ((x + (1 << w_to_shift)) >> (w_to_shift + 1)) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    ptr1 += pitch1;
    ptr2 += pitch2;
  }
}

// SAD entry point: forwards all arguments to the shared SAD/SSD implementation
// with the SAD variant selected.
void calcDiffSAD_32x32_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, const VSVideoInfo *vi)
{
  constexpr bool useSAD = true;
  calcDiff_SADorSSD_32x32_SSE2<useSAD>(
    ptr1, ptr2, pitch1, pitch2,
    width, height, plane, xblocks4,
    diff, chroma, vi);
}

// SSD entry point: forwards all arguments to the shared SAD/SSD implementation
// with the SSD variant selected.
void calcDiffSSD_32x32_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, const VSVideoInfo *vi)
{
  constexpr bool useSAD = false;
  calcDiff_SADorSSD_32x32_SSE2<useSAD>(
    ptr1, ptr2, pitch1, pitch2,
    width, height, plane, xblocks4,
    diff, chroma, vi);
}


// Accumulates block differences between two 8-bit planes into 'diff' for caller-defined
// block sizes. SAD == true: sum of absolute differences, SAD == false: sum of squared
// differences.
//
// The plane is walked in base blocks of 8x8 pixels for luma; for chroma planes the base
// block is reduced by the format's subsampling (4x8, 4x4 or 2x8). Whole base blocks go
// through the SIMD (or, for 2 pixel wide blocks, C) kernels; columns right of the last
// whole block and rows below the last whole block row are handled by scalar code.
// Every partial result is added to four entries of 'diff':
//   diff[temp1 + box1 + 0], diff[temp1 + box2 + 1], diff[temp2 + box1 + 2], diff[temp2 + box2 + 3]
//
//   ptr1, ptr2       : plane pointers
//   pitch1, pitch2   : row strides in bytes
//   width, height    : plane size in pixels
//   plane            : 0 = luma (no subsampling), otherwise subsampling is taken from vi
//   xblocks4         : number of diff entries per box row
//   chroma           : unused
//   xshiftS, yshiftS : shift that maps a luma pixel coordinate to a box index
//   xhalfS, yhalfS   : offset in luma pixels used to select the second (box2/temp2) box
template<bool SAD>
void calcDiff_SADorSSD_Generic_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, const VSVideoInfo *vi)
{
  (void)chroma;

  int temp1, temp2, y, x, u, difft, box1, box2;
  int yshift, yhalf, xshift, xhalf;
  int heighta, heights = height, widtha, widths = width;
  int yshifta, yhalfa, xshifta, xhalfa;
  const uint8_t* ptr1T, * ptr2T;

  // from YV12 to generic planar
  const int xsubsampling = plane == 0 ? 0 : vi->format->subSamplingW;
  const int ysubsampling = plane == 0 ? 0 : vi->format->subSamplingH;
  // base: luma: 8x8, chroma: divided with subsampling
  const int w_to_shift = 3 - xsubsampling;
  const int h_to_shift = 3 - ysubsampling;
  // whole blocks
  heighta = (height >> h_to_shift) << h_to_shift; // mod8 for luma, mod8 or 4 for chroma
  widtha = (width >> w_to_shift) << w_to_shift; // mod8 for luma, mod4 or 2 for chroma
  height >>= h_to_shift; // from here on: number of whole block rows
  width >>= w_to_shift;  // from here on: number of whole block columns

  using SAD_fn_t = decltype(calcSAD_SSE2_16xN<16>); // similar prototype for all X*Y
  SAD_fn_t* SAD_fn = nullptr;
  if constexpr (SAD) {
    if (xsubsampling == 0 && ysubsampling == 0) // YV24 or luma
      SAD_fn = calcSAD_SSE2_8xN<8>;
    else if (xsubsampling == 1 && ysubsampling == 0) // YV16
      SAD_fn = calcSAD_SSE2_4xN<8>;
    else if (xsubsampling == 1 && ysubsampling == 1) // YV12
      SAD_fn = calcSAD_SSE2_4xN<4>;
    else if (xsubsampling == 2 && ysubsampling == 0) // YV411
      SAD_fn = calcSAD_C_2xN<8>;
  }
  else {
    if (xsubsampling == 0 && ysubsampling == 0) // YV24 or luma
      SAD_fn = calcSSD_SSE2_8xN<8>;
    else if (xsubsampling == 1 && ysubsampling == 0) // YV16
      SAD_fn = calcSSD_SSE2_4xN<8>;
    else if (xsubsampling == 1 && ysubsampling == 1) // YV12
      SAD_fn = calcSSD_SSE2_4xN<4>;
    else if (xsubsampling == 2 && ysubsampling == 0) // YV411
      SAD_fn = calcSSD_C_2xN<8>;
  }
  // other formats are forbidden and were pre-checked (SAD_fn stays nullptr for them)

  // pixel-unit variants, scaled by the plane's subsampling (used by the scalar paths)
  yshifta = yshiftS - ysubsampling; // yshiftS  or yshiftS - 1
  yhalfa = yhalfS >> ysubsampling; // yhalfS  or yhalfS >> 1;
  xshifta = xshiftS - xsubsampling; //  xshiftS or  xshiftS - 1;
  xhalfa = xhalfS >> xsubsampling; // xhalfS  or xhalfS >> 1;
  // block-counter variants: these are the same for luma and chroma as well, 8x8
  // FIXME: check, really? Really.
  yshift = yshiftS - 3;
  yhalf = yhalfS >> 3; // div 8
  xshift = xshiftS - 3;
  xhalf = xhalfS >> 3;
  for (y = 0; y < height; ++y)
  {
    temp1 = (y >> yshift) * xblocks4;
    temp2 = ((y + yhalf) >> yshift) * xblocks4;
    for (x = 0; x < width; ++x)
    {
      SAD_fn(ptr1 + (x << w_to_shift), ptr2 + (x << w_to_shift), pitch1, pitch2, difft);
      box1 = (x >> xshift) << 2;
      box2 = ((x + xhalf) >> xshift) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    // rest non-simd: one pixel column at a time over the full height of this block row
    for (x = widtha; x < widths; ++x)
    {
      ptr1T = ptr1;
      ptr2T = ptr2;
      // u counts rows, so the bound is the vertical block size (1 << h_to_shift),
      // which differs from the horizontal one for 4:2:2 / 4:1:1 chroma
      for (difft = 0, u = 0; u < (1 << h_to_shift); ++u) // u < 8 or 4
      {
        if constexpr (SAD)
          difft += abs(ptr1T[x] - ptr2T[x]);
        else
          difft += (ptr1T[x] - ptr2T[x]) * (ptr1T[x] - ptr2T[x]);
        ptr1T += pitch1;
        ptr2T += pitch2;
      }
      box1 = (x >> xshifta) << 2;
      box2 = ((x + xhalfa) >> xshifta) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    // advance by one block row
    ptr1 += pitch1 << h_to_shift;
    ptr2 += pitch2 << h_to_shift;
  }
  // remaining rows below the last whole block row: pixel by pixel, y and x in pixel units
  for (y = heighta; y < heights; ++y)
  {
    temp1 = (y >> yshifta) * xblocks4;
    temp2 = ((y + yhalfa) >> yshifta) * xblocks4;
    for (x = 0; x < widths; ++x)
    {
      if constexpr (SAD)
        difft = abs(ptr1[x] - ptr2[x]);
      else {
        difft = ptr1[x] - ptr2[x];
        difft *= difft;
      }
      box1 = (x >> xshifta) << 2;
      box2 = ((x + xhalfa) >> xshifta) << 2;
      diff[temp1 + box1 + 0] += difft;
      diff[temp1 + box2 + 1] += difft;
      diff[temp2 + box1 + 2] += difft;
      diff[temp2 + box2 + 3] += difft;
    }
    ptr1 += pitch1;
    ptr2 += pitch2;
  }
  // end of YV12 / planar
}

// SAD entry point: forwards all arguments to the shared generic SAD/SSD
// implementation with the SAD variant selected.
void calcDiffSAD_Generic_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, const VSVideoInfo *vi)
{
  constexpr bool useSAD = true;
  calcDiff_SADorSSD_Generic_SSE2<useSAD>(
    ptr1, ptr2, pitch1, pitch2,
    width, height, plane, xblocks4,
    diff, chroma,
    xshiftS, yshiftS, xhalfS, yhalfS,
    vi);
}

// SSD entry point: forwards all arguments to the shared generic SAD/SSD
// implementation with the SSD variant selected.
void calcDiffSSD_Generic_SSE2(const uint8_t* ptr1, const uint8_t* ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, const VSVideoInfo *vi)
{
  constexpr bool useSAD = false;
  calcDiff_SADorSSD_Generic_SSE2<useSAD>(
    ptr1, ptr2, pitch1, pitch2,
    width, height, plane, xblocks4,
    diff, chroma,
    xshiftS, yshiftS, xhalfS, yhalfS,
    vi);
}


// Scalar block-difference metric between two planes with a noise threshold.
//
// Template parameters:
//   pixel_t : sample type (uint8_t or uint16_t).
//   SAD     : true  -> sum of absolute differences,
//             false -> sum of squared differences.
//   inc     : step between processed samples inside a row (original note:
//             "YUY2 increment").
//
// Parameters:
//   prvp/curp           : the two planes being compared.
//   prv_pitch/cur_pitch : row strides in pixel_t units (they are added to
//                         pixel_t pointers).
//   width/height        : plane dimensions in samples.
//   plane               : 0 uses the block geometry as passed; any other plane
//                         has it reduced by vi->format->subSamplingW/H.
//   xblocks4            : row stride of the diff array, in entries.
//   diff                : accumulator array. Every sub-block sum is added to
//                         four entries (slots +0..+3 of the bins picked by the
//                         block position and the position shifted by half a
//                         block).
//   chroma              : unused.
//   xshiftS/yshiftS, xhalfS/yhalfS : block shift and half-block size for the
//                         unsubsampled plane.
//   nt                  : per-sample differences and sub-block sums that do not
//                         exceed nt are ignored.
//   vi                  : supplies bit depth and chroma subsampling.
//
// For 16-bit samples every per-sample difference is shifted back to the 8-bit
// value range before thresholding and accumulation.
template<typename pixel_t, bool SAD, int inc>
void calcDiff_SADorSSD_Generic_c(const pixel_t* prvp, const pixel_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff,
  bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt,
  const VSVideoInfo *vi)
{
  (void)chroma;

  // 8-bit SSD fits in int; every other combination uses a 64-bit
  // intermediate (a squared 16-bit difference does not fit in 32 bits).
  typedef typename std::conditional<sizeof(pixel_t) == 1 && !SAD, int, int64_t>::type safeint_t;

  const int bits_per_pixel = vi->format->bitsPerSample;
  const int shift_count = SAD ? (bits_per_pixel - 8) : 2 * (bits_per_pixel - 8);

  const int ysubsampling = plane == 0 ? 0 : vi->format->subSamplingH;
  const int xsubsampling = plane == 0 ? 0 : vi->format->subSamplingW;
  const int yshift = yshiftS - ysubsampling;
  const int yhalf = yhalfS >> ysubsampling;
  const int xshift = xshiftS - xsubsampling;
  const int xhalf = xhalfS >> xsubsampling;

  // Largest extents that are a whole multiple of 2^(shift - 1).
  const int heighta = (height >> (yshift - 1)) << (yshift - 1);
  const int widtha = (width >> (xshift - 1)) << (xshift - 1);

  // Difference of one sample pair, already scaled back to the 8-bit range.
  const auto pixel_diff = [&](int a, int b) -> safeint_t {
    safeint_t d;
    if constexpr (SAD) {
      d = abs(a - b);
    }
    else {
      d = a - b;
      d *= d;
    }
    if constexpr (sizeof(pixel_t) == 2) d >>= shift_count; // back to 8 bit range
    return d;
  };

  // Adds value to the four diff entries that cover column x.
  const auto add_to_boxes = [&](int x, int row1, int row2, uint64_t value) {
    const int box1 = (x >> xshift) << 2;
    const int box2 = ((x + xhalf) >> xshift) << 2;
    diff[row1 + box1 + 0] += value;
    diff[row1 + box2 + 1] += value;
    diff[row2 + box1 + 2] += value;
    diff[row2 + box2 + 3] += value;
  };

  // whole blocks
  for (int y = 0; y < heighta; y += yhalf)
  {
    const int temp1 = (y >> yshift) * xblocks4;
    const int temp2 = ((y + yhalf) >> yshift) * xblocks4;
    for (int x = 0; x < widtha; x += xhalf)
    {
      const pixel_t* prvpT = prvp;
      const pixel_t* curpT = curp;
      // 64-bit accumulator: in SSD mode each sample can add up to 255^2, so a
      // 32-bit sum would overflow once xhalf * yhalf gets large enough.
      int64_t diffs = 0;
      for (int u = 0; u < yhalf; ++u)
      {
        for (int v = 0; v < xhalf; v += inc)
        {
          const safeint_t difft = pixel_diff(prvpT[x + v], curpT[x + v]);
          if (difft > nt) diffs += difft;
        }
        prvpT += prv_pitch;
        curpT += cur_pitch;
      }
      if (diffs > nt)
        add_to_boxes(x, temp1, temp2, static_cast<uint64_t>(diffs));
    }
    // rest: columns right of the last whole block, one column at a time
    for (int x = widtha; x < width; x += inc)
    {
      const pixel_t* prvpT = prvp;
      const pixel_t* curpT = curp;
      int64_t diffs = 0;
      for (int u = 0; u < yhalf; ++u)
      {
        const safeint_t difft = pixel_diff(prvpT[x], curpT[x]);
        if (difft > nt) diffs += difft;
        prvpT += prv_pitch;
        curpT += cur_pitch;
      }
      if (diffs > nt)
        add_to_boxes(x, temp1, temp2, static_cast<uint64_t>(diffs));
    }
    prvp += prv_pitch * yhalf;
    curp += cur_pitch * yhalf;
  }
  // rest: rows below the last whole block, thresholded sample by sample
  for (int y = heighta; y < height; ++y)
  {
    const int temp1 = (y >> yshift) * xblocks4;
    const int temp2 = ((y + yhalf) >> yshift) * xblocks4;
    for (int x = 0; x < width; x += inc)
    {
      const safeint_t difft = pixel_diff(prvp[x], curp[x]);
      if (difft > nt)
        add_to_boxes(x, temp1, temp2, static_cast<uint64_t>(difft));
    }
    prvp += prv_pitch;
    curp += cur_pitch;
  }
}

// Explicit instantiations of calcDiff_SADorSSD_Generic_c. TDecimateASM.h only
// declares the template, so other translation units can use exactly the
// combinations listed here:
//   uint8_t  : SSD and SAD, each with inc == 1 and inc == 2
//   uint16_t : SSD and SAD, inc == 1 only
template void calcDiff_SADorSSD_Generic_c<uint8_t, false, 1>(const uint8_t* prvp, const uint8_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);
template void calcDiff_SADorSSD_Generic_c<uint8_t, false, 2>(const uint8_t* prvp, const uint8_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);
template void calcDiff_SADorSSD_Generic_c<uint8_t, true, 1>(const uint8_t* prvp, const uint8_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);
template void calcDiff_SADorSSD_Generic_c<uint8_t, true, 2>(const uint8_t* prvp, const uint8_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);

// 16-bit sample variants
template void calcDiff_SADorSSD_Generic_c<uint16_t, false, 1>(const uint16_t* prvp, const uint16_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);
template void calcDiff_SADorSSD_Generic_c<uint16_t, true, 1>(const uint16_t* prvp, const uint16_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);


0707010000000A000081A4000000000000000000000001671240C900001A25000000000000000000000000000000000000003200000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateASM.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#ifndef __TDECIMATEASM_H__
#define __TDECIMATEASM_H__

//#include <windows.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <VapourSynth.h>
#include "internal.h"
#include "TDecimate.h"

//void HorizontalBlurSSE2_YUY2_R_luma(const uint8_t* srcp, uint8_t* dstp, int src_pitch, int dst_pitch, int width, int height);
//void HorizontalBlurSSE2_YUY2_R(const uint8_t* srcp, uint8_t* dstp, int src_pitch, int dst_pitch, int width, int height);
void VerticalBlurSSE2_R(const uint8_t* srcp, uint8_t* dstp, int src_pitch, int dst_pitch, int width, int height);
void HorizontalBlurSSE2_Planar_R(const uint8_t* srcp, uint8_t* dstp, int src_pitch, int dst_pitch, int width, int height);

// used for YUY2
//void calcLumaDiffYUY2SSD_SSE2_16(const uint8_t* prvp, const uint8_t* nxtp,
//  int width, int height, int prv_pitch, int nxt_pitch, uint64_t& ssd);
//void calcLumaDiffYUY2SAD_SSE2_16(const uint8_t* prvp, const uint8_t* nxtp,
//  int width, int height, int prv_pitch, int nxt_pitch, uint64_t& sad);
//void calcSSD_SSE2_32x16_YUY2_lumaonly(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& ssd);
//void calcSSD_SSE2_32x16(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& ssd);
//void calcSSD_SSE2_8x8_YUY2_lumaonly(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& ssd);
//void calcSAD_SSE2_32x16(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& sad);
//void calcSAD_SSE2_32x16_YUY2_lumaonly(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& sad);
//void calcSAD_SSE2_8x8_YUY2_lumaonly(const uint8_t* ptr1, const uint8_t* ptr2,
//  int pitch1, int pitch2, int& sad); // PF new


// generic
template<int blkSizeY>
void calcSSD_SSE2_16xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &ssd);
template<int blkSizeY>
void calcSSD_SSE2_8xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &ssd);
template<int blkSizeY>
void calcSSD_SSE2_4xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &ssd);

template<int blkSizeY>
void calcSAD_SSE2_8xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &sad); 
template<int blkSizeY>
void calcSAD_SSE2_4xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &sad);
template<int blkSizeY>
void calcSAD_SSE2_16xN(const uint8_t *ptr1, const uint8_t *ptr2, int pitch1, int pitch2, int &sad);


//-- helpers
void calcDiffSAD_32x32_SSE2(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t *diff, bool chroma, const VSVideoInfo *vi);

void calcDiffSSD_32x32_SSE2(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t *diff, bool chroma, const VSVideoInfo *vi);

// SSD entry point for arbitrary block sizes: forwards all arguments to the
// calcDiff_SADorSSD_Generic_SSE2 template with SAD = false.
void calcDiffSSD_Generic_SSE2(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t *diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, const VSVideoInfo *vi);

// SAD entry point for arbitrary block sizes: forwards all arguments to the
// calcDiff_SADorSSD_Generic_SSE2 template with SAD = true.
void calcDiffSAD_Generic_SSE2(const uint8_t *ptr1, const uint8_t *ptr2,
  int pitch1, int pitch2, int width, int height, int plane, int xblocks4, uint64_t *diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, const VSVideoInfo *vi);

// Scalar SAD (SAD = true) / SSD (SAD = false) block metric with noise
// threshold nt; inc is the step between processed samples in a row.
// Only declared here: the definition lives in a .cpp file that explicitly
// instantiates uint8_t with inc 1 and 2 and uint16_t with inc 1.
// prv_pitch / cur_pitch are in pixel_t units.
template<typename pixel_t, bool SAD, int inc>
void calcDiff_SADorSSD_Generic_c(const pixel_t* prvp, const pixel_t* curp,
  int prv_pitch, int cur_pitch, int width, int height, int plane, int xblocks4, uint64_t* diff, bool chroma, int xshiftS, int yshiftS, int xhalfS, int yhalfS, int nt, const VSVideoInfo *vi);

void CalcMetricsExtracted(const VSFrameRef *prevt, const VSFrameRef *currt, CalcMetricData& d, VSCore *core, const VSAPI *vsapi);

// Scalar horizontal [1 2 1]/4 blur of one plane; the last column (and the
// first one unless allow_leftminus1 is set) uses a two-sample average.
// srcp/dstp are byte pointers reinterpreted as pixel_t; the pitches are given
// in bytes; width/height are in samples.
// allow_leftminus1 == true means column 0 is not the left edge of the plane:
// srcp[-1] is read as its left neighbour.
template<typename pixel_t>
void HorizontalBlur_Planar_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
  int dst_pitch, int width, int height, bool allow_leftminus1);
//void HorizontalBlur_YUY2_lumaonly_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, bool allow_leftminus1);
//void HorizontalBlur_YUY2_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, bool allow_leftminus1);

// 8-bit horizontal blur: the SIMD routine HorizontalBlurSSE2_Planar_R handles
// the columns between the outermost 8 on each side, which are computed with
// scalar code. Column width-1 is always treated as the right edge.
void HorizontalBlur_Planar_SSE2(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
  int dst_pitch, int width, int height);
//void HorizontalBlur_YUY2_lumaonly_SSE2(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height);
//void HorizontalBlur_YUY2_SSE2(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height);

// Frame-level horizontal blur from src into dst: processes plane 0 only, or
// every plane when bchroma is true, choosing between the SSE2 routine (8-bit
// samples, cpuFlags->sse2 set, plane wide enough) and the C template.
void HorizontalBlur(const VSFrameRef *src, VSFrameRef *dst, bool bchroma,
  const CPUFeatures *cpuFlags, const VSAPI *vsapi);

// Scalar vertical [1 2 1]/4 blur of one plane; the first and last rows use a
// two-sample average. Pointers are byte pointers reinterpreted as pixel_t and
// the pitches are given in bytes.
template<typename pixel_t>
void VerticalBlur_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
  int dst_pitch, int width, int height);
//void VerticalBlur_YUY2_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, int inc);

// 8-bit vertical blur: rows 1..height-2 go through VerticalBlurSSE2_R, the
// top and bottom rows are computed with scalar code.
void VerticalBlur_SSE2(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
  int dst_pitch, int width, int height);

// Frame-level vertical blur from src into dst, same plane selection and
// SSE2-versus-C dispatch idea as HorizontalBlur.
void VerticalBlur(const VSFrameRef *src, VSFrameRef *dst, bool bchroma, const CPUFeatures *opti, const VSAPI *vsapi);


// handles 50% special case as well
void dispatch_blend(uint8_t* dstp, const uint8_t* srcp1, const uint8_t* srcp2, int width, int height,
  int dst_pitch, int src1_pitch, int src2_pitch, int weight_i, int bits_per_pixel, const CPUFeatures *cpuFlags);

#endif // __TDECIMATEASM_H__
0707010000000B000081A4000000000000000000000001671240C9000037EE000000000000000000000000000000000000003500000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateBlur.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include "TDecimateASM.h"

// hbd ready
void blurFrame(const VSFrameRef *src, VSFrameRef *dst, int iterations,
  bool bchroma, const CPUFeatures *cpuFlags, VSCore *core, const VSAPI *vsapi)
{
    const VSFormat *format = vsapi->getFrameFormat(src);
    int width = vsapi->getFrameWidth(src, 0);
    int height = vsapi->getFrameHeight(src, 0);

  VSFrameRef *tmp = vsapi->newVideoFrame(format, width, height, nullptr, core);
  HorizontalBlur(src, tmp, bchroma, cpuFlags, vsapi);
  VerticalBlur(tmp, dst, bchroma, cpuFlags, vsapi);
  for (int i = 1; i < iterations; ++i)
  {
    HorizontalBlur(dst, tmp, bchroma, cpuFlags, vsapi);
    VerticalBlur(tmp, dst, bchroma, cpuFlags, vsapi);
  }
  vsapi->freeFrame(tmp);
}

// Horizontally blurs src into dst, plane by plane.
//
//   bchroma  : false -> only plane 0 is processed, true -> all planes.
//   cpuFlags : cpuFlags->sse2 enables the SSE2 routine for 8-bit samples.
//
// 8-bit planes that are at least 16 samples wide use
// HorizontalBlur_Planar_SSE2 for the leading multiple-of-8 columns and the C
// template for the remaining columns. Everything else (narrow planes, no
// SSE2, 16-bit samples) goes through the C template for the whole row.
void HorizontalBlur(const VSFrameRef *src, VSFrameRef *dst, bool bchroma,
  const CPUFeatures *cpuFlags, const VSAPI *vsapi)
{
  const VSFormat *format = vsapi->getFrameFormat(src);

  const int np = !bchroma ? 1 : format->numPlanes;

  const bool use_sse2 = cpuFlags->sse2;

  const int pixelsize = format->bytesPerSample;

  for (int plane = 0; plane < np; ++plane)
  {
    const uint8_t *srcp = vsapi->getReadPtr(src, plane);
    const int src_pitch = vsapi->getStride(src, plane);
    const int width = vsapi->getFrameWidth(src, plane);
    const int widtha = (width >> 3) << 3; // mod 8
    const int height = vsapi->getFrameHeight(src, plane);
    uint8_t *dstp = vsapi->getWritePtr(dst, plane);
    const int dst_pitch = vsapi->getStride(dst, plane);

    // HorizontalBlur_Planar_SSE2 handles the outermost 8 columns on each side
    // separately, so it needs at least 16 columns: with widtha == 8 its
    // right-edge loop would start at column 0 and read srcp[-1].
    if (pixelsize == 1 && use_sse2 && widtha >= 16)
    {
      // always mod 8, sse2 unaligned!
      HorizontalBlur_Planar_SSE2(srcp, dstp, src_pitch, dst_pitch, widtha, height);
      if (width > widtha)
      {
        // Rest (non mod 8) on the right. The SSE2 helper wrote column
        // widtha - 1 as if it were the right edge of the plane, so restart the
        // scalar code one column earlier to recompute it with the full 3-tap
        // kernel. The left neighbour (column widtha - 2) exists, hence
        // allow_leftminus1 = true.
        HorizontalBlur_Planar_c<uint8_t>(srcp + widtha - 1, dstp + widtha - 1,
          src_pitch, dst_pitch, width - widtha + 1, height, true);
      }
    }
    else
    {
      // fixme: implement SIMD for 10-16 bits
      if (pixelsize == 1)
        HorizontalBlur_Planar_c<uint8_t>(srcp, dstp, src_pitch, dst_pitch, width, height, false);
      else // 10-16 bits
        HorizontalBlur_Planar_c<uint16_t>(srcp, dstp, src_pitch, dst_pitch, width, height, false);
    }
  }
}

// Scalar vertical blur of one plane.
//
//   srcp0/dstp0         : first byte of the source / destination plane; the
//                         data is accessed as pixel_t.
//   src_pitch/dst_pitch : row strides in bytes.
//   width/height        : plane size in samples.
//
// Interior rows use the kernel (above + 2*current + below + 2) >> 2; the first
// and last row average themselves with their only neighbour, rounding up.
// A plane with a single row has no vertical neighbour and is copied unchanged;
// empty planes (width or height <= 0) are left untouched.
template<typename pixel_t>
void VerticalBlur_c(const uint8_t* srcp0, uint8_t* dstp0, int src_pitch,
  int dst_pitch, int width, int height)
{
  if (width <= 0 || height <= 0) return;

  pixel_t* dstp = reinterpret_cast<pixel_t *>(dstp0);
  const pixel_t* srcp = reinterpret_cast<const pixel_t*>(srcp0);
  src_pitch /= sizeof(pixel_t);
  dst_pitch /= sizeof(pixel_t);

  if (height == 1)
  {
    // Nothing to blend with: reading "the next row" would run past the plane.
    for (int x = 0; x < width; x++)
      dstp[x] = srcp[x];
    return;
  }

  const pixel_t* prev = srcp;             // row y - 1
  const pixel_t* cur = srcp + src_pitch;  // row y

  // top line: average of rows 0 and 1
  for (int x = 0; x < width; x++)
    dstp[x] = (prev[x] + cur[x] + 1) >> 1;
  dstp += dst_pitch;

  // height - 2 lines in between
  for (int y = 1; y < height - 1; ++y)
  {
    const pixel_t* next = cur + src_pitch;
    for (int x = 0; x < width; x++)
      dstp[x] = (prev[x] + (cur[x] << 1) + next[x] + 2) >> 2;
    prev = cur;
    cur = next;
    dstp += dst_pitch;
  }

  // bottom line: average of the last two rows
  for (int x = 0; x < width; x++)
    dstp[x] = (prev[x] + cur[x] + 1) >> 1;
}

//void VerticalBlur_YUY2_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, int inc)
//{
//  if (width == 0) return;

//  const uint8_t* srcpp = srcp - src_pitch;
//  const uint8_t* srcpn = srcp + src_pitch;
//  // top line
//  for (int x = 0; x < width; x += inc)
//    dstp[x] = (srcp[x] + srcpn[x] + 1) >> 1;
//  srcpp += src_pitch;
//  srcp += src_pitch;
//  srcpn += src_pitch;
//  dstp += dst_pitch;
//  // height - 2 lines in between
//  for (int y = 1; y < height - 1; ++y)
//  {
//    for (int x = 0; x < width; x += inc)
//      dstp[x] = (srcpp[x] + (srcp[x] << 1) + srcpn[x] + 2) >> 2;
//    srcpp += src_pitch;
//    srcp += src_pitch;
//    srcpn += src_pitch;
//    dstp += dst_pitch;
//  }
//  // bottom line
//  for (int x = 0; x < width; x += inc)
//    dstp[x] = (srcpp[x] + srcp[x] + 1) >> 1;
//}

// 8-bit vertical blur: rows 1 .. height-2 are handed to VerticalBlurSSE2_R
// (called with the pointers advanced by one row and a row count of
// height - 2); the top and bottom rows are then written here as the rounded
// average of the row and its single neighbour.
void VerticalBlur_SSE2(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
  int dst_pitch, int width, int height)
{
  VerticalBlurSSE2_R(srcp + src_pitch, dstp + dst_pitch, src_pitch, dst_pitch, width, height - 2);

  const int last_row = height - 1;
  const uint8_t* const src_bottom = srcp + last_row * src_pitch;
  uint8_t* const dst_bottom = dstp + last_row * dst_pitch;

  for (int col = 0; col < width; ++col)
  {
    // top row blends with the row below it
    dstp[col] = (srcp[col] + srcp[col + src_pitch] + 1) >> 1;
    // bottom row blends with the row above it
    dst_bottom[col] = (src_bottom[col] + src_bottom[col - src_pitch] + 1) >> 1;
  }
}

void VerticalBlur(const VSFrameRef *src, VSFrameRef *dst, bool bchroma,
  const CPUFeatures *cpuFlags, const VSAPI *vsapi)
{
    const VSFormat *format = vsapi->getFrameFormat(src);

  const int np = !bchroma ? 1 : format->numPlanes;

  const bool use_sse2 = cpuFlags->sse2;

  const int pixelsize = format->bytesPerSample;

  for (int b = 0; b < np; ++b)
  {
    const int plane = b;
    const uint8_t* srcp = vsapi->getReadPtr(src, plane);
    int src_pitch = vsapi->getStride(src, plane);
    int width = vsapi->getFrameWidth(src, plane);
    int widtha = (width >> 4) << 4; // mod 16
    int height = vsapi->getFrameHeight(src, plane);
    uint8_t* dstp = vsapi->getWritePtr(dst, plane);
    int dst_pitch = vsapi->getStride(dst, plane);

      if (pixelsize == 1 && use_sse2 && widtha >= 16)
      {
        // 16x block is Ok
        VerticalBlur_SSE2(srcp, dstp, src_pitch, dst_pitch, widtha, height);
        //the rest on the right not covered by SIMD
        VerticalBlur_c<uint8_t>(srcp + widtha, dstp + widtha, src_pitch, dst_pitch, width - widtha, height);
      }
      else {
        // fixme: implement SIMD for 10-16 bits
        if(pixelsize == 1)
          VerticalBlur_c<uint8_t>(srcp, dstp, src_pitch, dst_pitch, width, height);
        else // 10-16 bits
          VerticalBlur_c<uint16_t>(srcp, dstp, src_pitch, dst_pitch, width, height);
      }

  }
}

// Scalar horizontal blur of one plane.
//
//   srcp0/dstp0         : first byte of the source / destination area; the
//                         data is accessed as pixel_t.
//   src_pitch/dst_pitch : row strides in bytes.
//   width/height        : size of the area in samples.
//   allow_leftminus1    : true when column 0 is not the left edge of the
//                         plane, i.e. srcp[-1] is a valid left neighbour.
//
// Interior columns use (left + 2*centre + right + 2) >> 2. The last column
// averages itself with its left neighbour; so does column 0 with its right
// neighbour when allow_leftminus1 is false. A one-column area is either
// copied (no left neighbour) or averaged with srcp[-1].
template<typename pixel_t>
void HorizontalBlur_Planar_c(const uint8_t* srcp0, uint8_t* dstp0, int src_pitch,
  int dst_pitch, int width, int height, bool allow_leftminus1)
{
  if (width == 0)
    return;

  const pixel_t* src_row = reinterpret_cast<const pixel_t*>(srcp0);
  pixel_t* dst_row = reinterpret_cast<pixel_t*>(dstp0);
  src_pitch /= sizeof(pixel_t);
  dst_pitch /= sizeof(pixel_t);

  const int last = width - 1;

  for (int row = 0; row < height; ++row, src_row += src_pitch, dst_row += dst_pitch)
  {
    if (width < 2)
    {
      // single column
      if (allow_leftminus1)
        dst_row[0] = (src_row[-1] + src_row[0] + 1) >> 1;
      else
        dst_row[0] = src_row[0];
      continue;
    }

    int col = 0;
    if (!allow_leftminus1)
    {
      // true left edge: only a right neighbour is available
      dst_row[0] = (src_row[0] + src_row[1] + 1) >> 1;
      col = 1;
    }
    for (; col < last; ++col)
      dst_row[col] = (src_row[col - 1] + 2 * src_row[col] + src_row[col + 1] + 2) >> 2;
    // right edge: only a left neighbour is available
    dst_row[col] = (src_row[col - 1] + src_row[col] + 1) >> 1;
  }
}

//void HorizontalBlur_YUY2_lumaonly_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, bool allow_leftminus1)
//{
//  if (width == 0)
//    return;

//  // YUYV minimum width is 4, at least two luma
//  const int startx = allow_leftminus1 ? 0 : 2;
//  for (int y = 0; y < height; ++y)
//  {
//    if (!allow_leftminus1)
//      dstp[0] = (srcp[0] + srcp[2] + 1) >> 1;
//    int x;
//    for (x = startx; x < width - 2; ++x)
//      dstp[x] = (srcp[x - 2] + (srcp[x] << 1) + srcp[x + 2] + 2) >> 2;
//    dstp[x] = (srcp[x - 2] + srcp[x] + 1) >> 1;
//    srcp += src_pitch;
//    dstp += dst_pitch;
//  }
//}

//void HorizontalBlur_YUY2_c(const uint8_t* srcp, uint8_t* dstp, int src_pitch,
//  int dst_pitch, int width, int height, bool allow_leftminus1)
//{
//  // width is rowwidth
//  if (width == 0)
//    return;

//  // YUYV minimum rowsize is 4, at least two luma
//  const int startx = allow_leftminus1 ? 0 : 4;

//  if (width >= 8) {
//    for (int y = 0; y < height; ++y)
//    {
//      if (!allow_leftminus1) {
//        dstp[0] = (srcp[-2] + (srcp[0] << 1) + srcp[2] + 2) >> 2; // Y
//        dstp[1] = (srcp[-3] + (srcp[1] << 1) + srcp[5] + 2) >> 2; // U
//        dstp[2] = (srcp[0] + (srcp[2] << 1) + srcp[4] + 2) >> 2; // Y
//        dstp[3] = (srcp[-1] + (srcp[3] << 1) + srcp[7] + 2) >> 2; // V
//      }
//      int x;
//      for (x = startx; x < width - 4; ++x)
//      {
//        dstp[x] = (srcp[x - 2] + (srcp[x] << 1) + srcp[x + 2] + 2) >> 2; // Y
//        ++x;
//        dstp[x] = (srcp[x - 4] + (srcp[x] << 1) + srcp[x + 4] + 2) >> 2; // U or V
//      }
//      dstp[x] = (srcp[x - 2] + (srcp[x] << 1) + srcp[x + 2] + 2) >> 2; // Y
//      ++x;
//      dstp[x] = (srcp[x - 4] + srcp[x] + 1) >> 1; // U
//      ++x;
//      dstp[x] = (srcp[x - 2] + srcp[x] + 1) >> 1; // Y
//      ++x;
//      dstp[x] = (srcp[x - 4] + srcp[x] + 1) >> 1; // V
//      srcp += src_pitch;
//      dstp += dst_pitch;
//    }
//    return;
//  }

//  // width (rowsize) == 4
//  for (int y = 0; y < height; ++y)
//  {
//    if (allow_leftminus1) {
//      dstp[0] = (srcp[-2] + (srcp[0] << 1) + srcp[2] + 2) >> 2; // Y
//      dstp[1] = (srcp[-3] + srcp[1] + 1) >> 1; // U
//      dstp[2] = (srcp[0] + srcp[2] + 1) >> 1; // Y
//      dstp[3] = (srcp[-1] + srcp[3] + 1) >> 1; // V
//    }
//    else {
//      dstp[0] = (srcp[0] + srcp[2] + 1) >> 1; // Y
//      dstp[1] = srcp[1]; // U
//      dstp[2] = (srcp[0] + srcp[2] + 1) >> 1; // Y
//      dstp[3] = srcp[3]; // V
//    }
//    srcp += src_pitch;
//    dstp += dst_pitch;
//  }

//}

// 8-bit horizontal blur with a SIMD core.
// Columns 8 .. width-9 are delegated to HorizontalBlurSSE2_Planar_R (called
// with both pointers advanced by 8 and a width of width - 16). The 8 leftmost
// and 8 rightmost columns of every row are computed here with scalar code:
// column 0 and column width-1 are two-sample averages, the others use the
// (left + 2*centre + right + 2) >> 2 kernel.
// width is expected to be a multiple of 8 (original note: "always mod 8, sse2
// unaligned") and must be at least 16: with width == 8 the right-hand loop
// starts at column 0 and reads srcp[-1].
void HorizontalBlur_Planar_SSE2(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
  int dst_pitch, int width, int height)
{
  HorizontalBlurSSE2_Planar_R(srcp + 8, dstp + 8, src_pitch, dst_pitch, width - 16, height);

  const int last = width - 1;
  for (int row = 0; row < height; ++row, srcp += src_pitch, dstp += dst_pitch)
  {
    // left border: columns 0..7
    dstp[0] = (srcp[0] + srcp[1] + 1) >> 1;
    for (int x = 1; x < 8; ++x)
      dstp[x] = (srcp[x - 1] + 2 * srcp[x] + srcp[x + 1] + 2) >> 2;

    // right border: columns width-8 .. width-1
    for (int x = width - 8; x < last; ++x)
      dstp[x] = (srcp[x - 1] + 2 * srcp[x] + srcp[x + 1] + 2) >> 2;
    dstp[last] = (srcp[last - 1] + srcp[last] + 1) >> 1;
  }
}

//void HorizontalBlur_YUY2_lumaonly_SSE2(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
//  int dst_pitch, int width, int height)
//{
//  HorizontalBlurSSE2_YUY2_R_luma(srcp + 8, dstp + 8, src_pitch, dst_pitch, width - 16, height);

//  for (int y = 0; y < height; ++y)
//  {
//    dstp[0] = (srcp[0] + srcp[2] + 1) >> 1;
//    dstp[2] = (srcp[0] + (srcp[2] << 1) + srcp[4] + 2) >> 2;
//    dstp[4] = (srcp[2] + (srcp[4] << 1) + srcp[6] + 2) >> 2;
//    dstp[6] = (srcp[4] + (srcp[6] << 1) + srcp[8] + 2) >> 2;
//    dstp[width - 8] = (srcp[width - 10] + (srcp[width - 8] << 1) + srcp[width - 6] + 2) >> 2;
//    dstp[width - 6] = (srcp[width - 8] + (srcp[width - 6] << 1) + srcp[width - 4] + 2) >> 2;
//    dstp[width - 4] = (srcp[width - 6] + (srcp[width - 4] << 1) + srcp[width - 2] + 2) >> 2;
//    dstp[width - 2] = (srcp[width - 4] + srcp[width - 2] + 1) >> 1;
//    srcp += src_pitch;
//    dstp += dst_pitch;
//  }
//}

//void HorizontalBlur_YUY2_SSE2(const uint8_t *srcp, uint8_t *dstp, int src_pitch,
//  int dst_pitch, int width, int height)
//{
//  HorizontalBlurSSE2_YUY2_R(srcp + 8, dstp + 8, src_pitch, dst_pitch, width - 16, height);
//  for (int y = 0; y < height; ++y)
//  {
//    dstp[0] = (srcp[0] + srcp[2] + 1) >> 1;
//    dstp[1] = (srcp[1] + srcp[5] + 1) >> 1;
//    dstp[2] = (srcp[0] + (srcp[2] << 1) + srcp[4] + 2) >> 2;
//    dstp[3] = (srcp[3] + srcp[7] + 1) >> 1;
//    dstp[4] = (srcp[2] + (srcp[4] << 1) + srcp[6] + 2) >> 2;
//    dstp[5] = (srcp[1] + (srcp[5] << 1) + srcp[9] + 2) >> 2;
//    dstp[6] = (srcp[4] + (srcp[6] << 1) + srcp[8] + 2) >> 2;
//    dstp[7] = (srcp[3] + (srcp[7] << 1) + srcp[11] + 2) >> 2;
//    dstp[width - 8] = (srcp[width - 10] + (srcp[width - 8] << 1) + srcp[width - 6] + 2) >> 2;
//    dstp[width - 7] = (srcp[width - 11] + (srcp[width - 7] << 1) + srcp[width - 3] + 2) >> 2;
//    dstp[width - 6] = (srcp[width - 8] + (srcp[width - 6] << 1) + srcp[width - 4] + 2) >> 2;
//    dstp[width - 5] = (srcp[width - 9] + (srcp[width - 5] << 1) + srcp[width - 1] + 2) >> 2;
//    dstp[width - 4] = (srcp[width - 6] + (srcp[width - 4] << 1) + srcp[width - 2] + 2) >> 2;
//    dstp[width - 3] = (srcp[width - 7] + srcp[width - 3] + 1) >> 1;
//    dstp[width - 2] = (srcp[width - 4] + srcp[width - 2] + 1) >> 1;
//    dstp[width - 1] = (srcp[width - 5] + srcp[width - 1] + 1) >> 1;
//    srcp += src_pitch;
//    dstp += dst_pitch;
//  }
//}
0707010000000C000081A4000000000000000000000001671240C90000436E000000000000000000000000000000000000003600000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateMode2.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include <algorithm>


// Frame callback for mode 2: maps output frame n to an input frame of clip2.
//
// Flow across activations (as implemented below):
//   1. arInitial with a cycle table (mode2_numCycles >= 0): request the child
//      frames of the previous, current and next cycle, then return.
//   2. arAllFramesReady, first time: compute metrics for the cycles, mark the
//      decimated frames, derive the input frame number `ret`, request it from
//      clip2 and store RetFrameIsReady in *frameData.
//   3. arAllFramesReady with *frameData == RetFrameIsReady: `ret` is computed
//      again and the frame is returned (copied and annotated if `display`).
//   Without a cycle table (mode2_numCycles < 0), `ret` is simply aLUT[n] and
//   the clip2 frame is requested right away.
//
// aLUT holds 5 ints per cycle; as used here, entries 1 and 3 bound the output
// frame numbers n of the cycle (first inclusive, end exclusive), entry 0 is
// the frame passed to setFrame and the start for getNonDecMode2, and entry 2
// is the matching stop. (Entry 4 is consumed by mode2MarkDecFrames.)
//
// Returns nullptr while frames are pending or after setFilterError.
const VSFrameRef * TDecimate::GetFrameMode2(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core)
{
    if (activationReason != arInitial && activationReason != arAllFramesReady)
        return nullptr;

  int ret = -20;
  if (mode2_numCycles >= 0)
  {
    // Find the cycle whose output range contains n.
    int cycleF = -20;
    for (int x = 0; x < mode2_numCycles; ++x)
    {
      if (aLUT[x * 5 + 1] <= n && aLUT[x * 5 + 3] > n)
      {
        cycleF = x;
        break;
      }
    }

    if (cycleF < 0) {
      // No cycle covers n. Using the -20 sentinel below would index aLUT with
      // a negative offset, so fail explicitly instead.
      vsapi->setFilterError("TDecimate:  mode 2 internal error (no cycle found for the requested frame). Please report this ASAP!", frameCtx);
      return nullptr;
    }

    if (activationReason == arInitial) {
        // Request the child frames of the previous, current and next cycle;
        // frame numbers are clamped to the valid range of the child clip.
        if (cycleF > 0) {
            int start = aLUT[(cycleF - 1) * 5] - 1;
            int end = start + curr.length;
            for (int i = start; i < end; i++)
                vsapi->requestFrameFilter(std::max(0, std::min(i, vi_child->numFrames - 1)), child, frameCtx);
        }

        {
            int start = aLUT[cycleF * 5] - 1;
            int end = start + curr.length;
            for (int i = start; i < end; i++)
                vsapi->requestFrameFilter(std::max(0, std::min(i, vi_child->numFrames - 1)), child, frameCtx);
        }

        if (cycleF < mode2_numCycles - 1) {
            int start = aLUT[(cycleF + 1) * 5] - 1;
            int end = start + curr.length;
            for (int i = start; i < end; i++)
                vsapi->requestFrameFilter(std::max(0, std::min(i, vi_child->numFrames - 1)), child, frameCtx);
        }

        return nullptr;
    }

    // Bring prev / curr / next in line with the cycles around cycleF, reusing
    // an already computed neighbour where the frame numbers match.
    if (cycleF > 0 && prev.frame != aLUT[(cycleF - 1) * 5])
    {
      if (curr.frame == aLUT[(cycleF - 1) * 5]) prev = curr;
      else
      {
        prev.setFrame(aLUT[(cycleF - 1) * 5]);
        getOvrCycle(prev, true);
        calcMetricCycle(prev, true, false, core, frameCtx);
        addMetricCycle(prev);
      }
    }
    else if (cycleF <= 0) prev.setFrame(-prev.length);

    if (curr.frame != aLUT[cycleF * 5])
    {
      if (next.frame == aLUT[cycleF * 5]) curr = next;
      else
      {
        curr.setFrame(aLUT[cycleF * 5]);
        getOvrCycle(curr, true);
        calcMetricCycle(curr, true, false, core, frameCtx);
        addMetricCycle(curr);
      }
    }

    if (cycleF < mode2_numCycles - 1 && next.frame != aLUT[(cycleF + 1) * 5])
    {
      next.setFrame(aLUT[(cycleF + 1) * 5]);
      getOvrCycle(next, true);
      calcMetricCycle(next, true, false, core, frameCtx);
      addMetricCycle(next);
    }
    else if (cycleF >= mode2_numCycles - 1) next.setFrame(-next.length);

    mode2MarkDecFrames(cycleF);
    // n relative to the first output frame of the cycle selects the matching
    // non-decimated input frame.
    ret = getNonDecMode2(n - aLUT[cycleF * 5 + 1], aLUT[cycleF * 5], aLUT[cycleF * 5 + 2]);
  }
  else ret = aLUT[n];

  if (ret < 0) {
    vsapi->setFilterError("TDecimate:  mode 2 internal error (ret less than 0). Please report this ASAP!", frameCtx);
    return nullptr;
  }

  // The clip2 frame has not been requested yet: request it and remember that
  // in *frameData so the next activation can fetch it.
  if (activationReason == arInitial || (activationReason == arAllFramesReady && (intptr_t)*frameData != RetFrameIsReady)) {
      vsapi->requestFrameFilter(ret, clip2, frameCtx);
      *frameData = (void *)RetFrameIsReady;
      return nullptr;
  }

//  if (debug)
//  {
//    sprintf(buf, "TDecimate:  inframe = %d  useframe = %d  rate = %3.6f\n", n, ret, rate);
//    OutputDebugString(buf);
//  }

  const VSFrameRef *src = vsapi->getFrameFilter(ret, clip2, frameCtx);

  if (display)
  {
    // Attach the info text as a frame property on a writable copy.
    VSFrameRef *dst = vsapi->copyFrame(src, core);
    vsapi->freeFrame(src);

#define SZ 160
    char buf[SZ] = { 0 };

    std::string text = "TDecimate " VERSION " by tritical\n";

    snprintf(buf, SZ, "Mode: 2  Rate = %3.6f\n", rate);
    text += buf;
    snprintf(buf, SZ, "inframe = %d  useframe = %d\n", n, ret);
    text += buf;
#undef SZ

    VSMap *props = vsapi->getFramePropsRW(dst);
    vsapi->propSetData(props, PROP_TDecimateDisplay, text.c_str(), text.size(), paReplace);

    return dst;
  }
  return src;
}

// Scans mode2_decA over [start, stop) and returns the index at which the
// zero-based count of entries equal to 0 reaches n.
// Returns -1 when the range is exhausted before that happens.
int TDecimate::getNonDecMode2(int n, int start, int stop) const
{
  int keptSoFar = -1;
  for (int idx = start; idx < stop; ++idx)
  {
    if (mode2_decA[idx] == 0)
      ++keptSoFar;
    // Checked on every iteration, not only after an increment.
    if (keptSoFar == n)
      return idx;
  }
  return -1;
}

// Decides which frames of the current cycle (curr) get dropped.
// Every mode2_decA entry in [curr.frame, curr.frameEO) must still hold the
// value -20; the first entry that does not makes the function return
// immediately (entries visited before it have already been reset to 0).
// Afterwards the base num/den pattern is applied, followed by every correction
// factor whose bit is set in this cycle's aLUT flag word (aLUT[cycleF*5 + 4]).
void TDecimate::mode2MarkDecFrames(int cycleF)
{
  const int first = curr.frame;
  const int last = curr.frameEO;

  for (int f = first; f < last; ++f)
  {
    if (mode2_decA[f] != -20)
      return;
    mode2_decA[f] = 0;
  }

  removeMinN(mode2_num, mode2_den, first, last);

  // mode2_cfs is terminated by the first non-positive entry.
  for (int k = 0; k < 10 && mode2_cfs[k] > 0; ++k)
  {
    if ((aLUT[cycleF * 5 + 4] & (1 << k)) == 0)
      continue;
    // The group length never exceeds the cycle length.
    const int span = mode2_cfs[k] <= curr.length ? mode2_cfs[k] : curr.length;
    removeMinN(1, span, first, last);
  }
}

// Marks frames for decimation (mode2_decA[f] = 1) inside [start, stop), taking
// up to m frames out of every consecutive group of n frames.
//
// For each group the candidates are consumed in three passes, stopping as soon
// as m frames have been marked:
//   1. frames flagged in curr.dupArray (the flag is cleared and curr.dupCount
//      decremented once used);
//   2. still-kept frames whose normalized metric is below 3.0 while both
//      nearest non-dropped neighbours are at least 3.0 and more than twice as
//      large; those with the largest gap to their neighbours go first;
//   3. any remaining frames, lowest raw metric first.
//
// A group that would run past the last frame (nfrms) is shortened and m is
// scaled down proportionally (rounded); if that leaves m < 1 the group is
// skipped. Note that m stays scaled for the rest of the call.
//
// mode2_metrics / mode2_order are used as scratch space.
void TDecimate::removeMinN(int m, int n, int start, int stop)
{
  for (int x = start; x < stop; x += n)
  {
    int dec = 0, t = 0, stop2 = n;
    if (x + n - 1 > nfrms)
    {
      // Truncated final group: scale the drop quota to the frames that exist.
      m = (int)(double((nfrms - x + 1)*m) / double(n) + 0.5);
      if (m < 1) continue;
      stop2 = nfrms - x + 1;
    }

    // Pass 1: pending duplicates recorded for the current cycle.
    if (curr.dupCount > 0)
    {
      int b = x - start;
      for (int i = 0; i < stop2; ++i, ++b)
      {
        if (curr.dupArray[b] == 1 && dec < m)
        {
          mode2_decA[x + i] = 1;
          --curr.dupCount;
          curr.dupArray[b] = 0;
          ++dec;
        }
      }
      if (dec >= m) continue;
    }

    // Pass 2: collect frames that are a clear local minimum of the metric.
    for (int i = 0; i < stop2; ++i)
    {
      if (mode2_decA[x + i] == 0)
      {
        int v = 1;
        double cM = (metricsOutArray[(x + i) << 1] * 100.0) / MAX_DIFF;
        // Negative means "neighbour not found yet".
        double pM = -20.0, nM = -20.0;
        while (pM < 0 || nM < 0)
        {
          if (pM < 0)
          {
            if (x + i - v >= 0)
            {
              // Only frames that are kept (0) or still undecided (-20) count.
              if (mode2_decA[x + i - v] == 0 || mode2_decA[x + i - v] == -20)
                pM = (metricsOutArray[(x + i - v) << 1] * 100.0) / MAX_DIFF;
            }
            else pM = 1.0; // ran off the start of the clip
          }
          if (nM < 0)
          {
            if (x + i + v <= nfrms)
            {
              if (mode2_decA[x + i + v] == 0 || mode2_decA[x + i + v] == -20)
                nM = (metricsOutArray[(x + i + v) << 1] * 100.0) / MAX_DIFF;
            }
            else nM = 1.0; // ran off the end of the clip
          }
          ++v;
        }
        if (pM >= 3.0 && nM >= 3.0 && cM < 3.0 && pM*0.5 > cM && nM*0.5 > cM)
        {
          mode2_order[t] = i;
          mode2_metrics[t] = (int)(std::min(pM - cM, nM - cM)*10000.0 + 0.5);
          ++t;
        }
      }
    }
    if (t > 0)
    {
      // Ascending sort; walk from the back so the largest gaps are used first.
      sortMetrics(mode2_metrics.data(), mode2_order.data(), t);
      for (int i = 0; i < t && dec < m; ++i)
      {
        if (mode2_decA[x + mode2_order[t - 1 - i]] != 1)
        {
          mode2_decA[x + mode2_order[t - 1 - i]] = 1;
          ++dec;
        }
      }
    }
    if (dec >= m) continue;

    // Pass 3: fall back to the lowest raw metrics in the group.
    for (int i = 0; i < stop2; ++i)
    {
      mode2_order[i] = i;
      mode2_metrics[i] = metricsOutArray[(x + i) << 1];
    }
    // Only stop2 entries were filled above. Sorting n entries (as before) let
    // stale scratch values from an earlier group sort to the front of a
    // truncated group and be used as frame offsets.
    sortMetrics(mode2_metrics.data(), mode2_order.data(), stop2);
    for (int i = 0; i < stop2 && dec < m; ++i)
    {
      if (mode2_decA[x + mode2_order[i]] != 1)
      {
        mode2_decA[x + mode2_order[i]] = 1;
        ++dec;
      }
    }
  }
}

// Whole-clip variant of removeMinN: marks frames for decimation
// (mode2_decA[f] = 1) over frames [0, vi.numFrames), taking up to m frames out
// of every consecutive group of n frames, using metricsArray as the metric
// source.
//
//   m, n      drop m frames per group of n frames
//   metricsT  caller-supplied scratch buffer for sort keys
//   orderT    caller-supplied scratch buffer for in-group frame offsets
//             (both are indexed up to n - 1 here; the caller visible in this
//             file passes buffers of vi.numFrames elements)
//   ovrC      in/out count of pending DROP_FRAME overrides in ovrArray;
//             decremented for every override consumed
//
// Per group the candidates are consumed in three passes, stopping once m
// frames are marked: override drops, then clear local minima of the metric,
// then the lowest raw metrics.
void TDecimate::removeMinN(int m, int n, uint64_t *metricsT, int *orderT, int &ovrC)
{
  for (int x = 0; x < vi.numFrames; x += n)
  {
    int dec = 0, t = 0, stop2 = n;
    if (x + n - 1 > nfrms)
    {
      // Truncated final group: scale the quota (rounded) to the frames that
      // exist and skip the group if nothing is left to drop.
      m = (int)(double((nfrms - x + 1)*m) / double(n) + 0.5);
      if (m < 1) continue;
      stop2 = nfrms - x + 1;
    }
    // Pass 1: honour user overrides that request a drop; each one is cleared
    // from ovrArray once it has been applied.
    if (ovrC > 0 && ovrArray.size())
    {
      for (int i = 0; i < stop2; ++i)
      {
        if ((ovrArray[x + i] & DROP_FRAME) && dec < m)
        {
          mode2_decA[x + i] = 1;
          --ovrC;
          ovrArray[x + i] &= ~DROP_FRAME;
          ++dec;
        }
      }
      if (dec >= m) continue;
    }
    // Pass 2: collect kept frames whose metric is a clear local minimum
    // compared with the nearest frames not already marked as dropped.
    for (int i = 0; i < stop2; ++i)
    {
      if (mode2_decA[x + i] == 0)
      {
        int v = 1;
        double cM = (metricsArray[(x + i) << 1] * 100.0) / MAX_DIFF;
        // Negative values mean "neighbour not found yet".
        double pM = -20.0, nM = -20.0;
        while (pM < 0 || nM < 0)
        {
          if (pM < 0)
          {
            if (x + i - v >= 0)
            {
              if (mode2_decA[x + i - v] != 1)
                pM = (metricsArray[(x + i - v) << 1] * 100.0) / MAX_DIFF;
            }
            else pM = 1.0; // ran off the start of the clip
          }
          if (nM < 0)
          {
            if (x + i + v <= nfrms)
            {
              if (mode2_decA[x + i + v] != 1)
                nM = (metricsArray[(x + i + v) << 1] * 100.0) / MAX_DIFF;
            }
            else nM = 1.0; // ran off the end of the clip
          }
          ++v;
        }
        if (pM >= 3.0 && nM >= 3.0 && cM < 3.0 && pM*0.5 > cM && nM*0.5 > cM)
        {
          orderT[t] = i;
          metricsT[t] = (int)(std::min(pM - cM, nM - cM)*10000.0 + 0.5);
          ++t;
        }
      }
    }
    if (t > 0)
    {
      // sortMetrics sorts ascending; iterate from the back so candidates with
      // the largest gap to their neighbours are dropped first.
      sortMetrics(metricsT, orderT, t);
      for (int i = 0; i < t && dec < m; ++i)
      {
        if (mode2_decA[x + orderT[t - 1 - i]] != 1)
        {
          mode2_decA[x + orderT[t - 1 - i]] = 1;
          ++dec;
        }
      }
    }
    if (dec >= m) continue;
    // Pass 3: fall back to the frames with the lowest raw metric.
    for (int i = 0; i < stop2; ++i)
    {
      orderT[i] = i;
      metricsT[i] = metricsArray[(x + i) << 1];
    }
    sortMetrics(metricsT, orderT, stop2);
    for (int i = 0; i < stop2 && dec < m; ++i)
    {
      if (mode2_decA[x + orderT[i]] != 1)
      {
        mode2_decA[x + orderT[i]] = 1;
        ++dec;
      }
    }
  }
}

// In-place insertion sort of the first `length` entries of `metrics` into
// ascending order; `order` is permuted in lock-step so order[k] keeps
// belonging to metrics[k]. Entries with equal metrics keep their relative
// order.
void TDecimate::sortMetrics(uint64_t *metrics, int *order, int length) const
{
  for (int pos = 1; pos < length; ++pos)
  {
    const uint64_t keyMetric = metrics[pos];
    const int keyOrder = order[pos];

    // Shift every larger element one slot to the right.
    int slot = pos - 1;
    for (; slot >= 0 && metrics[slot] > keyMetric; --slot)
    {
      metrics[slot + 1] = metrics[slot];
      order[slot + 1] = order[slot];
    }

    metrics[slot + 1] = keyMetric;
    order[slot + 1] = keyOrder;
  }
}

// Searches denominators min_den..100 for the one whose reciprocal 1/den is
// closest to decRatio without exceeding it (a tolerance of 1e-8 is allowed).
// On a tie the smallest denominator wins. Returns -20 if none qualifies.
int TDecimate::findDivisor(double decRatio, int min_den) const
{
  const double tolerance = 0.00000001;
  double bestErr = 5.0;
  int bestDen = -20;

  for (int den = min_den; den <= 100; ++den)
  {
    const double frac = 1.0 / static_cast<double>(den);
    if (frac > decRatio + tolerance)
      continue; // would drop more than requested

    const double err = fabs(frac - decRatio);
    if (err < bestErr)
    {
      bestErr = err;
      bestDen = den;
    }
  }
  return bestDen;
}

// For a fixed divisor, searches numerators 1..divisor-1 for the one whose
// fraction num/divisor is closest to decRatio without exceeding it (a
// tolerance of 1e-8 is allowed). On a tie the smallest numerator wins.
// Returns -20 if none qualifies.
int TDecimate::findNumerator(double decRatio, int divisor) const
{
  const double tolerance = 0.00000001;
  const double denom = static_cast<double>(divisor);
  double bestErr = 5.0;
  int bestNum = -20;

  for (int num = 1; num < divisor; ++num)
  {
    const double frac = static_cast<double>(num) / denom;
    if (frac > decRatio + tolerance)
      continue; // would drop more than requested

    const double err = fabs(frac - decRatio);
    if (err < bestErr)
    {
      bestErr = err;
      bestNum = num;
    }
  }
  return bestNum;
}

// Refines the base drop ratio num/den towards decRatio by adding up to ten
// terms of the form 1/rc[k] ("drop one extra frame every rc[k] frames").
//
// rc is zero-filled first; unused slots stay 0. Each factor is rounded to the
// nearest integer, bumped up to a multiple of the previous factor (of den for
// the first one) and capped at vi.numFrames. The search stops when the
// approximation is no longer below decRatio or when the next factor would be
// longer than the clip.
//
// Returns the resulting approximation of decRatio.
// Throws TIVTCError when the remaining error is still large enough to matter
// within the clip (1/|error| < vi.numFrames).
double TDecimate::findCorrectionFactors(double decRatio, int num, int den, int rc[10]) const
{
  for (int k = 0; k < 10; ++k)
    rc[k] = 0;

  double sum = static_cast<double>(num) / static_cast<double>(den);
  int base = den; // each factor must be a multiple of the one before it

  for (int k = 0; k < 10; ++k)
  {
    const double gap = decRatio - sum;
    if (gap <= 0.0)
      break;

    const double period = 1.0 / gap;
    if (period > vi.numFrames)
      break;

    int factor = (int)(period + 0.5);
    const int rem = factor % base;
    if (rem)
      factor += base - rem;
    if (factor > vi.numFrames)
      factor = vi.numFrames;

    rc[k] = factor;
    sum += 1.0 / static_cast<double>(factor);
    base = factor;
  }

  if ((1.0 / fabs(decRatio - sum)) < vi.numFrames)
      throw TIVTCError("TDecimate:  mode 2 error, unable to achieve a completely synced result!");
  return sum;
}

// Derives the mode 2 decimation strategy for converting `fps` to `rate` and
// fills the lookup table aLUT accordingly.
//
// Steps:
//   1. Pick a base pattern "drop mode2_num out of every mode2_den frames"
//      (findDivisor / findNumerator), optionally constrained by maxndl.
//   2. Compute correction factors (findCorrectionFactors) that bring the
//      achieved ratio closer to the requested one.
//   3. Choose a cycle length (clength) and the number of drops per cycle.
//   4a. If metricsArray holds a metric for every frame, run the whole
//       decimation now and store a direct output-frame -> input-frame map in
//       aLUT; mode2_numCycles is set to -20 and mode2_decA is released.
//   4b. Otherwise store five ints per cycle in aLUT:
//         [0] first input frame of the cycle
//         [1] [0] minus the number of frames dropped before the cycle
//         [2] one past the last input frame of the cycle
//         [3] [2] minus the number of frames dropped up to and including it
//         [4] bit mask of the correction factors applied in the cycle
//       and size the prev/curr/next cycle buffers to clength.
//
// The correction factors are copied to mode2_cfs in both cases.
//
// Returns the frame rate that is actually achieved.
// Throws TIVTCError if the requested ratio is out of range, if no valid
// num/den pair is found, or (from findCorrectionFactors) if the result cannot
// be kept in sync over the clip.
double TDecimate::buildDecStrategy()
{
  double frRatio = fps / rate;
  double rfRatio = rate / fps;
  if (rfRatio >= 99.0 / 100.0 || rfRatio <= 1.0 / 100.0)
      throw TIVTCError("TDecimate:  mode 2 error, unable to achieve desired decimation ratio!");
  double decRatio = 1.0 - rfRatio;
  if (frRatio < 3.0)
  {
    mode2_num = 1;
    mode2_den = findDivisor(decRatio, maxndl > 0 ? maxndl < 99 ? maxndl + 1 : 2 : 2);
  }
  else
  {
    mode2_den = (int)frRatio;
    mode2_num = findNumerator(decRatio, mode2_den);
    if (maxndl > 0 && maxndl < 99 && mode2_num - mode2_den < maxndl) mode2_den = mode2_num + maxndl;
  }
  if (mode2_den <= 0 || mode2_num <= 0 || mode2_num > 100 || mode2_den > 100 || mode2_num >= mode2_den)
      throw TIVTCError("TDecimate:  mode 2 invalid num and den results!");
  int clength = mode2_den, rc[10], arc[10];
  double aRate = fps*(1.0 - findCorrectionFactors(decRatio, mode2_num, mode2_den, rc));
  // The cycle must be long enough to contain the largest usable correction
  // factor (factors above 100 are only accepted when m2PA is set).
  for (int x = 0; x < 10; ++x)
  {
    if (rc[x] > 0 && (rc[x] <= 100 || m2PA) && rc[x] > clength)
      clength = rc[x];
  }
  if (clength == mode2_den && rc[0] > 0 && (mode2_den - mode2_num <= 1 || mode2_den <= 25))
  {
    while (clength <= 50) clength *= 2;
  }
  // Drops per full cycle: base pattern plus every factor that fits the cycle.
  int cdrop = ((int)(clength / mode2_den))*mode2_num;
  // Index of the first correction factor that is longer than one cycle.
  int rstart = -20;
  for (int x = 0; x < 10; ++x)
  {
    if (rc[x] > 0 && rc[x] > clength) { rstart = x; break; }
    if (rc[x] > 0 && rc[x] <= clength) cdrop += (int)(clength / rc[x]);
  }
  if (rstart == -20) rstart = 11; // none: the per-cycle factor loop is skipped
  mode2_numCycles = (int)((double)vi.numFrames / (double)clength + 1.0);
  // UINT64_MAX marks a metric that has not been computed.
  bool allMetrics = true;
  if (metricsArray.size())
  {
    for (int h = 0; h < vi.numFrames * 2; h += 2)
    {
      if (metricsArray[h] == UINT64_MAX) { allMetrics = false; break; }
    }
  }
  else allMetrics = false;
  if (aLUT.size()) aLUT.resize(0);
  if (allMetrics)
  {
    // The fill loop below writes up to tc entries, where tc is derived from
    // the achieved rate. aRate can be slightly above the requested rate, which
    // can make tc one larger than the count derived from `rate`; size the
    // table for whichever is larger so the loop never writes past the end.
    const int tc = (int)(vi.numFrames*aRate / fps);
    aLUT.resize(std::max((int)(vi.numFrames*rate / fps), tc), 0);

    std::vector<int> orderT(vi.numFrames, 0);
    std::vector<uint64_t> metricsT(vi.numFrames, 0);
    memset(mode2_decA.data(), 0, vi.numFrames * sizeof(int));
    int ovrC = 0;
    if (ovrArray.size())
    {
      for (int i = 0; i < vi.numFrames; ++i)
      {
        if (ovrArray[i] & DROP_FRAME) ++ovrC;
      }
    }
    removeMinN(mode2_num, mode2_den, metricsT.data(), orderT.data(), ovrC);
    for (int x = 0; x < 10; ++x)
    {
      if (rc[x] > 0)
        removeMinN(1, rc[x], metricsT.data(), orderT.data(), ovrC);
    }
    // Compact the surviving frames into the output -> input map.
    int v = 0;
    for (int i = 0; i < vi.numFrames && v < tc; ++i)
    {
      if (mode2_decA[i] != 1)
      {
        aLUT[v] = i;
        ++v;
      }
    }
    mode2_decA.resize(0);
//    if (debug)
//    {
//      sprintf(buf, "drop count = %d  expected = %d\n", vi.numFrames - v,
//        vi.numFrames - (int)(vi.numFrames*aRate / fps));
//      OutputDebugString(buf);
//    }
    mode2_numCycles = -20;
  }
  else
  {
    aLUT.resize(mode2_numCycles * 5, 0);

    // Factors that fit inside one cycle are applied in every cycle.
    int temp = 0;
    for (int x = 0; x < 10; ++x)
    {
      if (rc[x] > 0 && rc[x] <= clength)
        temp |= (1 << x);
    }
    for (int x = 0; x < mode2_numCycles; ++x)
      aLUT[x * 5 + 4] = temp;
    memset(arc, 0, 10 * sizeof(int));
    int dropCount = 0;
    for (int x = 0; x < mode2_numCycles; ++x)
    {
      // `add` is the number of input frames in this cycle (short at the end).
      int add = (x + 1)*clength >= vi.numFrames ? vi.numFrames - x*clength : clength;
      aLUT[x * 5 + 0] = x*clength;
      aLUT[x * 5 + 1] = x*clength - dropCount;
      aLUT[x * 5 + 2] = x*clength + add;
      dropCount += add == clength ? cdrop : (int)(double(add*cdrop) / double(clength) + 0.5);
      // Factors longer than a cycle: arc[i] accumulates frames until the
      // threshold mt is reached, then one extra drop is scheduled here.
      for (int i = rstart; i < 10; ++i)
      {
        int mt = rc[i] >> 1;
        if (mt%clength) mt += clength - (mt%clength);
        if (rc[i] > 0 && arc[i] + add >= mt)
        {
          if (add >= ((clength + 1) >> 1))
          {
            ++dropCount;
            aLUT[x * 5 + 4] |= (1 << i);
          }
          arc[i] = mt - rc[i];
        }
        else if (rc[i] > 0) arc[i] += add;
      }
      aLUT[x * 5 + 3] = x*clength + add - dropCount;
    }
//    if (debug)
//    {
//      sprintf(buf, "drop count = %d  expected = %d\n", dropCount,
//        vi.numFrames - (int)(vi.numFrames*aRate / fps));
//      OutputDebugString(buf);
//    }
    if (clength != 5)
    {
      prev.setSize(clength);
      curr.setSize(clength);
      next.setSize(clength);
    }
    prev.length = curr.length = next.length = clength;
  }
  memcpy(mode2_cfs, rc, 10 * sizeof(int));
//  if (debug)
//  {
//    sprintf(buf, "rate = %f  actual rate = %f\n", rate, aRate);
//    OutputDebugString(buf);
//    sprintf(buf, "mode2_num = %d  mode2_den = %d  numCycles = %d  clength = %d\n", mode2_num, mode2_den, mode2_numCycles, clength);
//    OutputDebugString(buf);
//    for (int x = 0; x < 10; ++x)
//    {
//      if (mode2_cfs[x] <= 0) break;
//      sprintf(buf, "mode2_cfs %d = %d\n", x, mode2_cfs[x]);
//      OutputDebugString(buf);
//    }
//  }
  return aRate;
}
0707010000000D000081A4000000000000000000000001671240C900002645000000000000000000000000000000000000003600000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateMode7.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include <inttypes.h>
#include <algorithm>

// Frame getter for mode 7: picks, for output frame n, one input frame near
// n * (fps / rate) and returns it from clip2.
//
// The function is entered several times per output frame:
//   - arInitial: request the child frames needed for any missing difference
//     metrics (and return), or, if none are missing, decide right away and
//     request the chosen frame from clip2.
//   - arAllFramesReady, *frameData not yet RetFrameIsReady: compute the missing
//     metrics from the delivered child frames, decide, request the chosen
//     frame from clip2 and mark *frameData.
//   - arAllFramesReady, *frameData == RetFrameIsReady: fetch and return the
//     chosen frame.
//
// The decision for output frame n is cached in mode2_decA[n]; -20 means it has
// not been made yet. On failure setFilterError is called and nullptr returned.
// When `display` is set, a copy of the frame is returned with a textual report
// stored in the PROP_TDecimateDisplay frame property.
const VSFrameRef * TDecimate::GetFrameMode7(int n, int activationReason, void **frameData, VSFrameContext *frameCtx, VSCore *core)
{
    if (activationReason != arInitial && activationReason != arAllFramesReady)
        return nullptr;

  // Candidate input frames around the ideal position n * ratio.
  double ratio = fps / rate;
  int prev_f = int(double(n - 1)*ratio + 1.0);
  if (prev_f < 0) prev_f = 0;
  int curr1_f = int(double(n)*ratio);
  // A candidate beyond the last frame forces the decision to the last frame;
  // this also makes the analysis block below get skipped.
  if (curr1_f > nfrms) mode2_decA[n] = nfrms;
  int curr2_f = int(double(n)*ratio + 1.0);
  if (curr2_f > nfrms) mode2_decA[n] = nfrms;
  int next_f = int(double(n + 1)*ratio);
  if (next_f > nfrms) next_f = nfrms;
  int curr_real = mode2_decA[n];
  int chosen = 0;

  if (curr_real == -20) {
    // If the previous output frame has already been decided, use its actual
    // source frame instead of the estimate.
    int prev_real = n == 0 ? -20 : mode2_decA[n - 1];
    if (prev_real != -20) prev_f = prev_real;
    // rup: the ideal position lies closer to curr2_f than to curr1_f.
    bool rup = double(n) * ratio - double(curr1_f) >= 0.5 ? true : false;
    bool requesting_frames = false;
    // Make sure the metric of every frame in the analysis window is known.
    // UINT64_MAX marks a metric that has not been computed.
    for (int i = std::max(prev_f - 3, 1); i <= std::min(next_f + 2, nfrms); ++i)
    {
      if (metricsOutArray[i << 1] == UINT64_MAX)
      {
        if (metricsArray.size() && metricsArray[i << 1] != UINT64_MAX)
          metricsOutArray[i << 1] = metricsArray[i << 1];
        else
        {
            if (activationReason == arInitial) {
                // The metric of frame i compares frames i-1 and i.
                vsapi->requestFrameFilter(i - 1, child, frameCtx);
                vsapi->requestFrameFilter(i, child, frameCtx);
                requesting_frames = true;
            } else {
              int blockNI, blocksI;
              uint64_t metricF;
              const VSFrameRef *frame1 = vsapi->getFrameFilter(i - 1, child, frameCtx);
              const VSFrameRef *frame2 = vsapi->getFrameFilter(i, child, frameCtx);
              metricsOutArray[i << 1] =
                calcMetric(frame1, frame2,
                  vi_child, blockNI, blocksI, metricF, false, core);
              vsapi->freeFrame(frame1);
              vsapi->freeFrame(frame2);
            }
        }
      }
    }
    // Child frames were requested; the decision is made on the next call.
    if (requesting_frames)
        return nullptr;

    // Classify the relationship between the candidates. `chosen` encodes the
    // outcome: 0 = default (curr1_f), 1 = curr2_f, 2 and 3 = either candidate
    // (resolved below), 4 = curr2_f + 1.
    try {
        if (same_group(curr1_f, curr2_f))
        {
          if (next_f - curr2_f > 1 && similar_group(prev_f, curr2_f) &&
            diff_group(next_f, next_f + 1) && diff_group(curr2_f, curr2_f + 1))
            chosen = 4;
          else chosen = 2;
        }
        else if (same_group(prev_f, curr1_f)) chosen = 1;
        else if (similar_group(prev_f, curr1_f))
        {
          if (similar_group(curr1_f, curr2_f) && !same_group(curr2_f, next_f))
            chosen = 3;
          else if (diff_group(curr1_f, curr2_f))
            chosen = 1;
        }
        else if (diff_group(prev_f, curr1_f))
        {
          if (diff_group(curr2_f, next_f)) chosen = 3;
          else if (diff_group(curr1_f, curr2_f) && same_group(curr1_f - 1, curr1_f) &&
            same_group(curr2_f, next_f) && diff_group(next_f, next_f + 1) &&
            curr1_f - prev_f == 2 && diff_group(prev_f - 1, prev_f))
            chosen = 1;
        }
    } catch (const TIVTCError &e) {
        // The group helpers throw on inconsistent ranges or missing metrics.
        vsapi->setFilterError(e.what(), frameCtx);
        return nullptr;
    }

    if (chosen == 4) mode2_decA[n] = curr2_f + 1;
    else if (chosen >= 2) // either
    {
      // 2: pick by rounding direction. 3: pick curr2_f when its metric is
      // more than 1.5x that of curr1_f, or when rounding up and it is at
      // least 2/3 of it.
      if ((chosen == 2 && rup) || (chosen == 3 &&
        ((metricsOutArray[curr2_f << 1] * 2 > metricsOutArray[curr1_f << 1] * 3) ||
          (rup && metricsOutArray[curr2_f << 1] * 3 >= metricsOutArray[curr1_f << 1] * 2))))
        mode2_decA[n] = curr2_f;
      else mode2_decA[n] = curr1_f;
    }
    else if (chosen == 0) mode2_decA[n] = curr1_f;
    else mode2_decA[n] = curr2_f;
  }

  int ret = mode2_decA[n];
  if (ret < 0 || ret > nfrms) {
      vsapi->setFilterError("TDecimate:  mode 7 internal error! Tried to request a frame that doesn't exist.", frameCtx);
      return nullptr;
  }

//  if (debug)
//  {
//    sprintf(buf, "TDecimate:  ------------------------------------------\n");
//    OutputDebugString(buf);
//    sprintf(buf, "TDecimate:  inframe = %d  useframe = %d  chosen = %d\n", n, ret, chosen);
//    OutputDebugString(buf);
//    sprintf(buf, "TDecimate:  prev = %d  curr1 = %d  curr2 = %d  next = %d\n", prev_f,
//      curr1_f, curr2_f, next_f);
//    OutputDebugString(buf);
//    for (int i = std::max(0, ret - 3); i <= std::min(ret + 3, nfrms); ++i)
//    {
//      sprintf(buf, "TDecimate:  %d:  %3.2f  %" PRIu64 "%s%s\n", i, double(metricsOutArray[i << 1])*100.0 / double(MAX_DIFF),
//        metricsOutArray[i << 1], metricsOutArray[i << 1] < same_thresh ? "  (D)" :
//        metricsOutArray[i << 1] > diff_thresh ? "  (N)" :
//        aLUT[i] == 2 ? "  (N)" : aLUT[i] == 1 ? "  (S)" :
//        aLUT[i] == 0 ? "  (D)" : "", wasChosen(i, n) ? "  *" : "");
//      OutputDebugString(buf);
//    }
//  }

  // The chosen frame has not been requested from clip2 yet: request it and
  // remember that fact in *frameData so the next call can fetch it.
  if (activationReason == arInitial || (activationReason == arAllFramesReady && (intptr_t)*frameData != RetFrameIsReady)) {
      vsapi->requestFrameFilter(ret, clip2, frameCtx);
      *frameData = (void *)RetFrameIsReady;
      return nullptr;
  }

  const VSFrameRef *src = vsapi->getFrameFilter(ret, clip2, frameCtx);

  if (display)
  {
    // Frame properties are attached to a writable copy; the original
    // reference is released.
    VSFrameRef *dst = vsapi->copyFrame(src, core);
    vsapi->freeFrame(src);

#define SZ 160
    char buf[SZ] = { 0 };

    std::string text = "TDecimate " VERSION " by tritical\n";

    snprintf(buf, SZ, "Mode: 7  Rate = %3.6f\n", rate);
    text += buf;
    snprintf(buf, SZ, "inframe = %d  useframe = %d  chosen = %d\n", n, ret, chosen);
    text += buf;
    snprintf(buf, SZ, "p = %d  c1 = %d  c2 = %d  n = %d\n", prev_f,
      curr1_f, curr2_f, next_f);
    text += buf;
    snprintf(buf, SZ, "dt = %3.2f  %" PRIu64 "  vt = %3.2f  %" PRIu64 "\n", dupThresh, same_thresh,
      vidThresh, diff_thresh);
    text += buf;

    // One line per frame within +/-3 of the chosen frame: normalized and raw
    // metric, a classification tag, and "*" if some nearby output frame uses it.
    for (int i = std::max(0, ret - 3); i <= std::min(ret + 3, nfrms); ++i)
    {
      snprintf(buf, SZ, "%d:  %3.2f  %" PRIu64 "%s%s\n", i, double(metricsOutArray[i << 1])*100.0 / double(MAX_DIFF),
        metricsOutArray[i << 1], metricsOutArray[i << 1] < same_thresh ? "  (D)" :
        metricsOutArray[i << 1] > diff_thresh ? "  (N)" :
        aLUT[i] == 2 ? "  (N)" : aLUT[i] == 1 ? "  (S)" :
        aLUT[i] == 0 ? "  (D)" : "", wasChosen(i, n) ? "  *" : "");
    text += buf;
    }
#undef SZ

    VSMap *props = vsapi->getFramePropsRW(dst);
    vsapi->propSetData(props, PROP_TDecimateDisplay, text.c_str(), text.size(), paReplace);

    return dst;
  }
  return src;
}

// Reports whether input frame i is the cached choice (mode2_decA) of any
// output frame in the window [n - 5, n + 5), clamped to [0, nfrmsN).
bool TDecimate::wasChosen(int i, int n)
{
  const int windowBegin = std::max(n - 5, 0);
  const int windowEnd = std::min(n + 5, nfrmsN);

  int out = windowBegin;
  while (out < windowEnd)
  {
    if (mode2_decA[out] == i)
      return true;
    ++out;
  }
  return false;
}

// True when diff_f rates every frame step from f1 to f2 as 0 (or lower).
// Propagates the TIVTCError thrown by diff_f.
bool TDecimate::same_group(int f1, int f2)
{
  const int worstStep = diff_f(f1, f2);
  return worstStep <= 0;
}

// True when diff_f rates no frame step from f1 to f2 higher than 1.
// Propagates the TIVTCError thrown by diff_f.
bool TDecimate::similar_group(int f1, int f2)
{
  const int worstStep = diff_f(f1, f2);
  return worstStep <= 1;
}

// True when diff_f rates at least one frame step from f1 to f2 as exactly 2,
// the highest class. Propagates the TIVTCError thrown by diff_f.
bool TDecimate::diff_group(int f1, int f2)
{
  const int worstStep = diff_f(f1, f2);
  return worstStep == 2;
}

// Returns the highest per-frame class (as produced by mode7_analysis and
// cached in aLUT, where -20 means "not analysed yet") over the frames
// f1 + 1 .. f2, after clamping f1 to >= 0 and f2 to <= nfrms.
// When both ends clamp to the same frame, that frame is analysed (to fill the
// cache) and 0 is returned.
// Throws TIVTCError if f2 < f1, f2 < 0 or f1 > nfrms; errors from
// mode7_analysis propagate.
int TDecimate::diff_f(int f1, int f2)
{
  const bool badRange = f2 < f1 || f2 < 0 || f1 > nfrms;
  if (badRange)
      throw TIVTCError("TDecimate:  mode 7 internal error (f2 < f1)!");

  const int lo = f1 < 0 ? 0 : f1;
  const int hi = f2 > nfrms ? nfrms : f2;

  if (lo == hi)
  {
    if (aLUT[hi] == -20)
      aLUT[hi] = mode7_analysis(hi);
    return 0;
  }

  int worst = 0;
  for (int f = lo + 1; f <= hi; ++f)
  {
    if (aLUT[f] == -20)
      aLUT[f] = mode7_analysis(f);
    if (aLUT[f] > worst)
      worst = aLUT[f];
  }
  return worst;
}

// Classifies frame n by its difference metric relative to its neighbours:
//   2 = different, 1 = similar / undetermined, 0 = same (duplicate).
// Frame 0 is always 2. The last frame (nfrms) is judged against its
// predecessor only; every other frame is judged against both neighbours.
// Throws TIVTCError if a required metric is still UINT64_MAX (not computed).
int TDecimate::mode7_analysis(int n) const
{
  if (n == 0) return 2;

  const uint64_t before = metricsOutArray[(n - 1) << 1];
  const uint64_t here = metricsOutArray[n << 1];

  if (n == nfrms)
  {
    if (before == UINT64_MAX || here == UINT64_MAX)
        throw TIVTCError("TDecimate:  mode 7 internal error (uncalculated metrics)!");

    // Above the video threshold, or more than 1.5x the previous metric.
    if (here > diff_thresh || here * 2 > before * 3)
      return 2;

    const bool clearlySame = here < same_thresh || here * 4 < before ||
      (here * 2 < before && before > diff_thresh);
    return clearlySame ? 0 : 1;
  }

  const uint64_t after = metricsOutArray[(n + 1) << 1];
  if (before == UINT64_MAX || here == UINT64_MAX || after == UINT64_MAX)
      throw TIVTCError("TDecimate:  mode 7 internal error (uncalculated metrics)!");

  if (here > diff_thresh) return 2; // definitely different
  if (here < same_thresh) return 0; // definitely the same

  if (here < before && here < after)
  {
    // Local minimum: a duplicate if it is far enough below both neighbours.
    const uint64_t lowNeighbour = std::min(before, after);
    const bool deepDip = here * 2 < lowNeighbour && before > diff_thresh && after > diff_thresh;
    if (deepDip || here * 4 < lowNeighbour)
      return 0;
  }
  else if (here > before && here > after)
  {
    // Local maximum: different if more than 1.5x the larger neighbour.
    const uint64_t highNeighbour = std::max(before, after);
    if (here * 2 > highNeighbour * 3)
      return 2;
  }
  return 1;
}
0707010000000E000081A4000000000000000000000001671240C900002792000000000000000000000000000000000000003400000000vapoursynth-tivtc-2+2.g7abd4a3/src/TDecimateOut.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TDecimate.h"
#include <algorithm>

// Debug hook that currently does nothing: the whole body is commented out, so
// the parameters n, film and blend are unused.
// The disabled code formatted metrics/dups/matches of the prev, curr and next
// cycles and sent them to OutputDebugString.
void TDecimate::debugOutput1(int n, bool film, int blend)
{
//  if (mode == 0 || (mode == 3 && vfrDec == 0))
//  {
//    snprintf(buf, SZ, "TDecimate:  %d: ", curr.frame);
//    formatMetrics(curr);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", curr.frame);
//    formatMatches(curr, prev);
//    OutputDebugString(buf);
//  }
//  else
//  {
//    snprintf(buf, SZ, "TDecimate:  %d: ", prev.frame);
//    formatMetrics(prev);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", prev.frame);
//    formatDups(prev);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", prev.frame);
//    formatMatches(prev, prev);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", curr.frame);
//    formatMetrics(curr);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", curr.frame);
//    formatDups(curr);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", curr.frame);
//    formatMatches(curr, prev);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", next.frame);
//    formatMetrics(next);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", next.frame);
//    formatDups(next);
//    OutputDebugString(buf);
//    snprintf(buf, SZ, "TDecimate:  %d: ", next.frame);
//    formatMatches(next, curr);
//    OutputDebugString(buf);
//  }
//  if (film)
//  {
//    if (cycleR > 1 || blend == 3) snprintf(buf, SZ, "TDecimate:  %d:  Dropping Frames:", n);
//    else snprintf(buf, SZ, "TDecimate:  %d:  Dropping Frame:", n);
//    formatDecs(curr);
//  }
//  else snprintf(buf, SZ, "TDecimate:  %d:  VIDEO", n);
//  OutputDebugString(buf);
}

// Debug hook that currently does nothing: the whole body is commented out, so
// all parameters are unused.
// The disabled code reported which input frame (or blend of f1/f2 with the
// given weights) was used for output frame n via OutputDebugString.
void TDecimate::debugOutput2(int n, int ret, bool film, int f1, int f2, double amount1,
  double amount2)
{
//  if (amount1 == 0.0 && amount2 == 0.0)
//    snprintf(buf, SZ, "TDecimate:  inframe = %d  useframe = %d\n", n, ret);
//  else snprintf(buf, SZ, "TDecimate:  inframe: %d  useframe = blend %d-%d (%3.2f,%3.2f)\n",
//    n, f1, f2, amount1*100.0, amount2*100.0);
//  OutputDebugString(buf);
}

//void TDecimate::formatMetrics(Cycle &current)
//{
//  char tempBuf[40];
//  for (int i = current.cycleS; i < current.cycleE; ++i)
//  {
//    sprintf(tempBuf, " %3.2f", current.diffMetricsN[i]);
//    strcat(buf, tempBuf);
//  }
//  strcat(buf, "\n");
//}

//void TDecimate::formatDups(Cycle &current)
//{
//  char tempBuf[40];
//  for (int i = current.cycleS; i < current.cycleE; ++i)
//  {
//    sprintf(tempBuf, " %d", current.dupArray[i]);
//    strcat(buf, tempBuf);
//  }
//  strcat(buf, "\n");
//}

void TDecimate::formatDecs(std::string &buf, Cycle &current)
{
  char tempBuf[40];
  int i = current.cycleS, b = current.frameSO;
  for (; i < current.cycleE; ++i, ++b)
  {
    if (current.decimate[i] == 1)
    {
      sprintf(tempBuf, " %d", b);
      buf += tempBuf;
    }
  }
}

//void TDecimate::formatMatches(Cycle &current)
//{
//  char tempBuf[40];
//  for (int i = current.cycleS; i < current.cycleE; ++i)
//  {
//    if (current.match[i] >= 0)
//      sprintf(tempBuf, " %c  %d", MTC(current.match[i]), current.filmd2v[i]);
//    else
//      sprintf(tempBuf, " %c", MTC(current.match[i]));
//    strcat(buf, tempBuf);
//  }
//  strcat(buf, "\n");
//}

//void TDecimate::formatMatches(Cycle &current, Cycle &previous)
//{
//  char tempBuf[40];
//  int mp;
//  if (previous.frame != current.frame)
//    mp = previous.cycleE > 0 ? previous.match[previous.cycleE - 1] : -20;
//  else mp = -20;
//  int mc = current.match[current.cycleS];
//  for (int i = current.cycleS; i < current.cycleE; ++i)
//  {
//    sprintf(tempBuf, " %c", MTC(mc));
//    strcat(buf, tempBuf);
//    if (mc >= 0)
//    {
//      if (checkMatchDup(mp, mc))
//      {
//        sprintf(tempBuf, " (%s)", "mdup");
//        strcat(buf, tempBuf);
//      }
//      if (current.filmd2v[i] == 1)
//      {
//        sprintf(tempBuf, " (%s)", "d2vdup");
//        strcat(buf, tempBuf);
//      }
//    }
//    mp = mc;
//    if (i < current.cycleE - 1) mc = current.match[i + 1];
//  }
//  strcat(buf, "\n");
//}

void TDecimate::addMetricCycle(const Cycle &j)
{
  if (metricsOutArray.size() == 0) return;
  int i = j.cycleS, p = j.frameSO;
  for (; i < j.cycleE; ++i, ++p)
  {
    metricsOutArray[p << 1] = j.diffMetricsU[i];
    metricsOutArray[(p << 1) + 1] = j.diffMetricsUF[i];
  }
}

// Builds the textual report for output frame n and stores it in the
// PROP_TDecimateDisplay property of dst.
//
//   dst               writable frame that receives the property
//   n, ret            output frame number and the input frame used for it
//   film              true: list the dropped frames; false: report "VIDEO"
//   amount1, amount2  blend weights; both 0.0 means no blending
//   f1, f2            the blended input frames (only printed when blending)
//
// The report contains a header, one line per frame of the current cycle
// (frame number, "**" if decimated, normalized metric, match character and
// duplicate tags) and a final FILM/VIDEO line. The "(dup)"/"(new)" tag is only
// printed outside mode 0 and outside mode 3 with vfrDec == 0.
//
// The AviSynth on-frame text drawing (column wrapping) is not part of this
// port; the text is simply newline-separated.
void TDecimate::displayOutput(VSFrameRef *dst, int n,
  int ret, bool film, double amount1, double amount2, int f1, int f2)
{
  char buf[160], tempBuf[40];

  std::string text = "TDecimate " VERSION " by tritical\n";

  snprintf(buf, sizeof(buf), "Mode: %d  Cycle: %d  CycleR: %d  Hybrid: %d\n", mode, cycle, cycleR, hybrid);
  text += buf;

  if (amount1 == 0.0 && amount2 == 0.0)
    snprintf(buf, sizeof(buf), "inframe: %d  useframe: %d\n", n, ret);
  else snprintf(buf, sizeof(buf), "inframe: %d  useframe: blend %d-%d (%3.2f,%3.2f)\n", n, f1, f2,
    amount1*100.0, amount2*100.0);
  text += buf;

  const bool showDupTag = !(mode == 0 || (mode == 3 && vfrDec == 0));

  // Match of the frame preceding the cycle (-20 = unknown). The cycleE check
  // avoids reading prev.match[-1] when the previous cycle has no valid frames.
  int mp = (prev.frame != -20 && prev.cycleE > 0) ? prev.match[prev.cycleE - 1] : -20;
  int mc = curr.match[curr.cycleS];
  for (int x = curr.cycleS; x < curr.cycleE; ++x)
  {
    snprintf(buf, sizeof(buf), "%d%s%3.2f", curr.frame + x, curr.decimate[x] == 1 ? ":**" : ":  ",
      curr.diffMetricsN[x]);
    text += buf;

    // Pieces are appended to the std::string directly, so a line can never
    // overflow a fixed-size buffer.
    if (mc >= 0)
    {
      snprintf(tempBuf, sizeof(tempBuf), " %c", MTC(mc));
      text += tempBuf;
    }
    if (showDupTag)
      text += (curr.dupArray[x] == 1 ? " (dup)" : " (new)");
    if (mc >= 0)
    {
      if (checkMatchDup(mp, mc))
        text += " (mdup)";
      if (curr.filmd2v[x] == 1)
        text += " (d2vdup)";
    }
    text += "\n";

    mp = mc;
    if (x < curr.cycleE - 1) mc = curr.match[x + 1];
  }

  if (film)
  {
    text += "FILM, Drop:";
    formatDecs(text, curr);
    text += "\n";
  }
  else text += "VIDEO\n";

  VSMap *props = vsapi->getFramePropsRW(dst);
  vsapi->propSetData(props, PROP_TDecimateDisplay, text.c_str(), text.size(), paReplace);
}
0707010000000F000081A4000000000000000000000001671240C90001EF0F000000000000000000000000000000000000002B00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFM.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <cstring>

#include "TFM.h"
#include "TFMasm.h"
#include "TCommonASM.h"

// Values compared against the integer read from the "_FieldBased" frame
// property in TFM::GetFrame when the field order has to be auto-detected.
enum _FieldBased
{
  Progressive = 0,      // treated like top-field-first by GetFrame
  BottomFieldFirst = 1,
  TopFieldFirst = 2
};

// Produces output frame n.
//
// On the initial activation the previous, current and next source frames are
// requested (neighbour indices are clamped to [0, nfrms]) and nullptr is
// returned. Once all frames are ready, a field match is selected either from
// the override / D2V data (getMatchOvr) or by comparing fields according to
// `mode`; the chosen match is woven into a new frame `dst`, which is returned.
//
// Match indices 0..4 are the ones printed as p, c, n, b, u by writeDisplay.
// The value -20 is used throughout as the "not set / not computed" marker
// (dfrm, tfrm, mics[], blockN[], xblocks).
//
// Note: this function writes to member state (order, mode, field, PP, MI,
// lastMatch, mode7_field) on every call.
//
// Returns the new frame, or nullptr after requesting frames or after
// reporting an error through setFilterError.
const VSFrameRef *TFM::GetFrame(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core)
{
  // Clamp the requested frame number to the valid range.
  if (n < 0) n = 0;
  else if (n > nfrms) n = nfrms;

  if (activationReason == arInitial) {
      vsapi->requestFrameFilter(std::max(0, n - 1), child, frameCtx);
      vsapi->requestFrameFilter(n, child, frameCtx);
      vsapi->requestFrameFilter(std::min(n + 1, nfrms), child, frameCtx);
      return nullptr;
  } else if (activationReason != arAllFramesReady) {
      return nullptr;
  }

  const VSFrameRef *prv = vsapi->getFrameFilter(std::max(0, n - 1), child, frameCtx);
  const VSFrameRef *src = vsapi->getFrameFilter(n, child, frameCtx);
  const VSFrameRef *nxt = vsapi->getFrameFilter(std::min(n + 1, nfrms), child, frameCtx);

  // dfrm / tfrm record which match is currently woven into dst / tmp.
  int dfrm = -20, tfrm = -20;
  int mmatch1, nmatch1, nmatch2, mmatch2, fmatch, tmatch;
  int combed = -1, tcombed = -1, xblocks = -20;
  bool d2vfilm = false, d2vmatch = false, isSC = true;
  int mics[5] = { -20, -20, -20, -20, -20 };
  int blockN[5] = { -20, -20, -20, -20, -20 };
  // Restore the user-supplied settings before applying per-frame overrides.
  order = order_origSaved;
  mode = mode_origSaved;
  field = field_origSaved;
  PP = PP_origSaved;
  MI = MI_origSaved;
  getSettingOvr(n); // process overrides

  const VSMap *props = vsapi->getFramePropsRO(src);
  int err;

  // order == -1: take the field order from the source frame's property.
  if (order == -1) {
      int64_t field_based = vsapi->propGetInt(props, "_FieldBased", 0, &err);
      if (err) { // prop not present
          vsapi->setFilterError("TFM: Couldn't find the '_FieldBased' frame property. The 'order' parameter must be used.", frameCtx);
          vsapi->freeFrame(prv);
          vsapi->freeFrame(src);
          vsapi->freeFrame(nxt);
          return nullptr;
      }

      /// Pretend it's top field first when it says progressive?
      order = (field_based == TopFieldFirst || field_based == Progressive);
//      order = child->GetParity(n) ? 1 : 0;
  }
  // field == -1: follow the field order.
  if (field == -1) field = order;
  // Candidate matches that get compared against match 1 further below.
  int frstT = field^order ? 2 : 0;
  int scndT = (mode == 2 || mode == 6) ? (field^order ? 3 : 4) : (field^order ? 0 : 2);

  // dst receives the output (frame properties copied from src); tmp is a scratch frame.
  VSFrameRef *dst = vsapi->newVideoFrame(vi->format, vi->width, vi->height, src, core);
  VSFrameRef *tmp = vsapi->newVideoFrame(vi->format, vi->width, vi->height, nullptr, core);

//  if (debug)
//  {
//    sprintf(buf, "TFM:  ----------------------------------------\n");
//    OutputDebugString(buf);
//  }
  // Path 1: the match comes from the override data or from D2V information.
  if (getMatchOvr(n, fmatch, combed, d2vmatch,
    flags == 5 ? checkSceneChange(prv, src, nxt, n) : false))
  {
    createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
    if (PP > 0 && combed == -1)
    {
      if (checkCombed(dst, n, fmatch, blockN, xblocks, mics, false))
      {
        if (d2vmatch)
        {
          // The D2V-derived match is combed: discard it, reset the MICs
          // and fall back to normal field matching.
          d2vmatch = false;
          for (int j = 0; j < 5; ++j)
            mics[j] = -20;
          goto d2vCJump;
        }
        else combed = 2;
      }
      else combed = 0;
    }
    d2vfilm = d2vduplicate(fmatch, combed, n);
    if (micout > 0)
    {
      // Fill in the MIC values that have not been computed yet
      // (matches 3 and 4 only when micout > 1).
      for (int i = 0; i < 5; ++i)
      {
        if (mics[i] == -20 && (i < 3 || micout > 1))
        {
          createWeaveFrame(tmp, prv, src, nxt, i, tfrm);
          checkCombed(tmp, n, i, blockN, xblocks, mics, true);
        }
      }
    }
    fileOut(fmatch, combed, d2vfilm, n, mics[fmatch], mics);
    if (display) writeDisplay(dst, n, fmatch, combed, true, blockN[fmatch], xblocks,
      d2vmatch, mics, prv, src, nxt);
//    if (debug)
//    {
//      char buft[20];
//      if (mics[fmatch] < 0) sprintf(buft, "N/A");
//      else sprintf(buft, "%d", mics[fmatch]);
//      sprintf(buf, "TFM:  frame %d  - final match = %c %s  MIC = %s  (OVR)\n", n, MTC(fmatch),
//        d2vmatch ? "(D2V)" : "", buft);
//      OutputDebugString(buf);
//      if (micout > 0)
//      {
//        if (micout > 1)
//          sprintf(buf, "TFM:  frame %d  - mics: p = %d  c = %d  n = %d  b = %d  u = %d\n",
//            n, mics[0], mics[1], mics[2], mics[3], mics[4]);
//        else
//          sprintf(buf, "TFM:  frame %d  - mics: p = %d  c = %d  n = %d\n",
//            n, mics[0], mics[1], mics[2]);
//        OutputDebugString(buf);
//      }
//      sprintf(buf, "TFM:  frame %d  - mode = %d  field = %d  order = %d  d2vfilm = %c\n", n, mode, field, order,
//        d2vfilm ? 'T' : 'F');
//      OutputDebugString(buf);
//      if (combed != -1)
//      {
//        if (combed == 1) sprintf(buf, "TFM:  frame %d  - CLEAN FRAME  (forced!)\n", n);
//        else if (combed == 5) sprintf(buf, "TFM:  frame %d  - COMBED FRAME  (forced!)\n", n);
//        else if (combed == 0) sprintf(buf, "TFM:  frame %d  - CLEAN FRAME\n", n);
//        else sprintf(buf, "TFM:  frame %d  - COMBED FRAME\n", n);
//        OutputDebugString(buf);
//      }
//    }
    if (usehints || PP >= 2) putFrameProperties(dst, fmatch, combed, d2vfilm, mics);
    // Remember this frame's result for the next call.
    lastMatch.frame = n;
    lastMatch.match = fmatch;
    lastMatch.field = field;
    lastMatch.combed = combed;
    vsapi->freeFrame(prv);
    vsapi->freeFrame(src);
    vsapi->freeFrame(nxt);
    vsapi->freeFrame(tmp);
    return dst;
  }
  // Path 2: normal field matching, selected by `mode`.
d2vCJump:
  if (mode == 6)
  {
    // Try up to four matches in sequence (fmatch, scndT, thrdT, frthT) until
    // one is not combed; scndT and frthT are only tried when isSC is true.
    int thrdT = field^order ? 0 : 2;
    int frthT = field^order ? 4 : 3;
    tcombed = 0;
    if (!slow) fmatch = compareFields(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    else fmatch = compareFieldsSlow(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    if (micmatching > 0)
      checkmm(fmatch, 1, frstT, dst, dfrm, tmp, tfrm, prv, src, nxt, n, blockN, xblocks, mics);
    createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
    if (checkCombed(dst, n, fmatch, blockN, xblocks, mics, false))
    {
      tcombed = 2;
      if (ubsco) isSC = checkSceneChange(prv, src, nxt, n);
      if (isSC) createWeaveFrame(tmp, prv, src, nxt, scndT, tfrm);
      if (isSC && !checkCombed(tmp, n, scndT, blockN, xblocks, mics, false))
      {
        fmatch = scndT;
        tcombed = 0;
        copyFrame(dst, tmp, vsapi);
        dfrm = fmatch;
      }
      else
      {
        createWeaveFrame(tmp, prv, src, nxt, thrdT, tfrm);
        if (!checkCombed(tmp, n, thrdT, blockN, xblocks, mics, false))
        {
          fmatch = thrdT;
          tcombed = 0;
          copyFrame(dst, tmp, vsapi);
          dfrm = fmatch;
        }
        else
        {
          if (isSC) createWeaveFrame(tmp, prv, src, nxt, frthT, tfrm);
          if (isSC && !checkCombed(tmp, n, frthT, blockN, xblocks, mics, false))
          {
            fmatch = frthT;
            tcombed = 0;
            copyFrame(dst, tmp, vsapi);
            dfrm = fmatch;
          }
        }
      }
    }
    if (combed == -1 && PP > 0) combed = tcombed;
  }
  else if (mode == 7)
  {
//    if (debug && lastMatch.frame != n && n != 0)
//    {
//      sprintf(buf, "TFM:  mode 7 - non-linear access detected!\n");
//      OutputDebugString(buf);
//    }
    // Check both match 1 and frstT for combing and pick based on which of
    // them is clean; mode7_field persists between calls.
    combed = 0;
    bool combed1 = false, combed2 = false;
    if (!slow) fmatch = compareFields(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    else fmatch = compareFieldsSlow(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    createWeaveFrame(dst, prv, src, nxt, 1, dfrm);
    combed1 = checkCombed(dst, n, 1, blockN, xblocks, mics, false);
    createWeaveFrame(dst, prv, src, nxt, frstT, dfrm);
    combed2 = checkCombed(dst, n, frstT, blockN, xblocks, mics, false);
    if (!combed1 && !combed2)
    {
      createWeaveFrame(dst, prv, src, nxt,fmatch, dfrm);
      if (field == 0) mode7_field = 1;
      else mode7_field = 0;
    }
    else if (!combed2 && combed1)
    {
      createWeaveFrame(dst, prv, src, nxt, frstT, dfrm);
      mode7_field = 1;
      fmatch = frstT;
    }
    else if (!combed1 && combed2)
    {
      createWeaveFrame(dst, prv, src, nxt, 1, dfrm);
      mode7_field = 0;
      fmatch = 1;
    }
    else
    {
      // Both candidates are combed: use match 1 and mark the frame combed.
      createWeaveFrame(dst, prv, src, nxt, 1, dfrm);
      combed = 2;
      field = mode7_field;
      fmatch = 1;
    }
  }
  else
  {
    // Remaining modes: compare match 1 against frstT first.
    if (!slow) 
      fmatch = compareFields(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    else 
      fmatch = compareFieldsSlow(prv, src, nxt, 1, frstT, nmatch1, nmatch2, mmatch1, mmatch2, n);
    if (micmatching > 0)
      checkmm(fmatch, 1, frstT, dst, dfrm, tmp, tfrm, prv, src, nxt, n, blockN, xblocks, mics);
    createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
    // Modes above 3 always consider scndT; modes 1..3 only when the first result is combed.
    if (mode > 3 || (mode > 0 && checkCombed(dst, n, fmatch, blockN, xblocks, mics, false)))
    {
      if (mode < 4) tcombed = 2;
      if (mode != 2)
      {
        if (!slow) 
          tmatch = compareFields(prv, src, nxt, fmatch, scndT, nmatch1, nmatch2, mmatch1, mmatch2, n);
        else 
          tmatch = compareFieldsSlow(prv, src, nxt, fmatch, scndT, nmatch1, nmatch2, mmatch1, mmatch2, n);
        if (micmatching > 0)
          checkmm(tmatch, fmatch, scndT, dst, dfrm, tmp, tfrm, prv, src, nxt, n, blockN, xblocks, mics);
        // checkmm may have woven a different match into dst; restore fmatch.
        createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
      }
      else tmatch = scndT;
      if (tmatch == scndT)
      {
        if (mode > 3)
        {
          fmatch = tmatch;
          createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
        }
        else if (mode != 2 || !ubsco || checkSceneChange(prv, src, nxt, n))
        {
          createWeaveFrame(tmp, prv, src, nxt, tmatch, tfrm);
          if (!checkCombed(tmp, n, tmatch, blockN, xblocks, mics, false))
          {
            fmatch = tmatch;
            tcombed = 0;
            copyFrame(dst, tmp, vsapi);
            dfrm = fmatch;
          }
        }
      }
      // Modes 3 and 5: if still combed, also compare matches 3 and 4.
      if ((mode == 3 && tcombed == 2) || (mode == 5 && checkCombed(dst, n, fmatch, blockN, xblocks, mics, false)))
      {
        tcombed = 2;
        if (!ubsco || checkSceneChange(prv, src, nxt, n))
        {
          if (!slow) 
            tmatch = compareFields(prv, src, nxt, 3, 4, nmatch1, nmatch2, mmatch1, mmatch2, n);
          else 
            tmatch = compareFieldsSlow(prv, src, nxt, 3, 4, nmatch1, nmatch2, mmatch1, mmatch2, n);
          if (micmatching > 0)
            checkmm(tmatch, 3, 4, dst, dfrm, tmp, tfrm, prv, src, nxt, n, blockN, xblocks, mics);
          createWeaveFrame(tmp, prv, src, nxt, tmatch, tfrm);
          if (!checkCombed(tmp, n, tmatch, blockN, xblocks, mics, false))
          {
            fmatch = tmatch;
            tcombed = 0;
            copyFrame(dst, tmp, vsapi);
            dfrm = fmatch;
          }
          else
            createWeaveFrame(dst, prv, src, nxt, fmatch, dfrm);
        }
      }
      if (mode == 5 && tcombed == -1) tcombed = 0;
    }
    if ((mode == 1 || mode == 2 || mode == 3) && tcombed == -1) tcombed = 0;
    if (combed == -1 && PP > 0) combed = tcombed;
    if (PP > 0 && combed == -1)
    {
      if (checkCombed(dst, n, fmatch, blockN, xblocks, mics, false)) combed = 2;
      else combed = 0;
    }
    // Sanity check: dst must hold the weave of the final match.
    if (dfrm != fmatch) {
        vsapi->setFilterError("TFM: internal error (dfrm!=fmatch). Please report this.", frameCtx);
        vsapi->freeFrame(prv);
        vsapi->freeFrame(src);
        vsapi->freeFrame(nxt);
        vsapi->freeFrame(dst);
        vsapi->freeFrame(tmp);
        return nullptr;
    }
  }
  // MIC output and/or MIC-based match correction.
  if (micout > 0 || (micmatching > 0 && mics[fmatch] > 15 && mode != 7 && !(micmatching == 2 && (mode == 0 || mode == 4))
    && (!mmsco || checkSceneChange(prv, src, nxt, n))))
  {
    // Compute the MIC values that are still missing.
    for (int i = 0; i < 5; ++i)
    {
      if (mics[i] == -20 && (i < 3 || micout > 1 || micmatching > 0))
      {
        createWeaveFrame(tmp, prv, src, nxt, i, tfrm);
        checkCombed(tmp, n, i, blockN, xblocks, mics, true);
      }
    }
    if (micmatching > 0 && mode != 7 && mics[fmatch] > 15 &&
      (!mmsco || checkSceneChange(prv, src, nxt, n)))
    {
      // Insertion sort of the MIC values in ascending order: order1 holds the
      // sorted values, order2 the corresponding match indices.
      int i, j, temp1, temp2, order1[5], order2[5] = { 0, 1, 2, 3, 4 };
      for (i = 0; i < 5; ++i) order1[i] = mics[i];
      for (i = 1; i < 5; ++i)
      {
        j = i;
        temp1 = order1[j];
        temp2 = order2[j];
        while (j > 0 && order1[j - 1] > temp1)
        {
          order1[j] = order1[j - 1];
          order2[j] = order2[j - 1];
          --j;
        }
        order1[j] = temp1;
        order2[j] = temp2;
      }
      if (micmatching == 1)
      {
        // Also reached via goto from the micmatching == 3 case below.
      othertest:
        if (order1[0] * 3 < order1[1] && abs(order1[0] - order1[1]) > 15 &&
          order1[0] < MI && order2[0] != fmatch &&
          (((field^order) && (order2[0] == 1 || order2[0] == 2 || order2[0] == 3)) ||
          (!(field^order) && (order2[0] == 0 || order2[0] == 1 || order2[0] == 4))))
        {
          bool xfield = (field^order) == 0 ? false : true;
          int lmatch = lastMatch.frame == n - 1 ? lastMatch.match : -20;
          if (!((order2[0] == 4 && lmatch == 0 && !xfield && (order2[1] == 0 || order2[2] == 0)) ||
            (order2[0] == 3 && lmatch == 2 && xfield && (order2[1] == 2 || order2[2] == 2))))
          {
            micChange(n, fmatch, order2[0], dst, prv, src, nxt,
              fmatch, combed, dfrm);
          }
        }
        if (order1[0] * 4 < order1[1] && abs(order1[0] - order1[1]) > 30 &&
          order1[0] < MI && order1[1] >= MI && order2[0] != fmatch)
        {
          micChange(n, fmatch, order2[0], dst, prv, src, nxt,
            fmatch, combed, dfrm);
        }
      }
      else if (micmatching == 2 || micmatching == 3)
      {
        // Mode-specific checks: switch to an alternative match when its MIC is
        // below MI, less than a third of the reference MIC, and at least 30 lower.
        int try1 = field^order ? 2 : 0, try2, minm, mint, try3, try4;
        if (mode == 1) // p/c + n
        {
          try2 = try1 == 2 ? 0 : 2;
          minm = std::min(mics[1], mics[try1]);
          if (mics[try2] * 3 < minm && mics[try2] < MI && abs(mics[try2] - minm) >= 30 && try2 != fmatch)
            micChange(n, fmatch, try2, dst, prv, src, nxt,
              fmatch, combed, dfrm);
        }
        else if (mode == 2) // p/c + u
        {
          try2 = try1 == 2 ? 3 : 4;
          minm = std::min(mics[1], mics[try1]);
          if (mics[try2] * 3 < minm && mics[try2] < MI && abs(mics[try2] - minm) >= 30 && try2 != fmatch)
            micChange(n, fmatch, try2, dst, prv, src, nxt,
              fmatch, combed, dfrm);
        }
        else if (mode == 3) // p/c + n + u/b
        {
          try2 = try1 == 2 ? 0 : 2;
          minm = std::min(mics[1], mics[try1]);
          mint = std::min(mics[3], mics[4]);
          try3 = try1 == 2 ? (mint == mics[3] ? 3 : 4) : (mint == mics[4] ? 4 : 3);
          if (mics[try2] * 3 < minm && mics[try2] < MI && abs(mics[try2] - minm) >= 30 && try2 != fmatch &&
            fmatch != 3 && fmatch != 4)
          {
            micChange(n, fmatch, try2, dst, prv, src, nxt,
              fmatch, combed, dfrm);
            minm = mics[try2];
          }
          else if (fmatch == try2) minm = std::min(mics[try2], minm);
          if (mint * 3 < minm && mint < MI && abs(mint - minm) >= 30 && fmatch != 3 && fmatch != 4)
            micChange(n, fmatch, try3, dst, prv, src, nxt,
              fmatch, combed, dfrm);
        }
        else if (mode == 5) // p/c/n + u/b
        {
          minm = std::min(mics[0], std::min(mics[1], mics[2]));
          mint = std::min(mics[3], mics[4]);
          try3 = try1 == 2 ? (mint == mics[3] ? 3 : 4) : (mint == mics[4] ? 4 : 3);
          if (mint * 3 < minm && mint < MI && abs(mint - minm) >= 30 && fmatch != 3 && fmatch != 4)
            micChange(n, fmatch, try3, dst, prv, src, nxt,
              fmatch, combed, dfrm);
        }
        else if (mode == 6) // p/c + u + n + b
        {
          try2 = try1 == 2 ? 3 : 4;
          try3 = try1 == 2 ? 0 : 2;
          try4 = try2 == 3 ? 4 : 3;
          minm = std::min(mics[1], mics[try1]);
          if (mics[try2] * 3 < minm && mics[try2] < MI && abs(mics[try2] - minm) >= 30 && fmatch != try2 &&
            fmatch != try3 && fmatch != try4)
          {
            micChange(n, fmatch, try2, dst, prv, src, nxt,
              fmatch, combed, dfrm);
            minm = mics[try2];
          }
          else if (fmatch == try2) minm = std::min(mics[try2], minm);
          if (mics[try3] * 3 < minm && mics[try3] < MI && abs(mics[try3] - minm) >= 30 && fmatch != try3 &&
            fmatch != try4)
          {
            micChange(n, fmatch, try3, dst, prv, src, nxt,
              fmatch, combed, dfrm);
            minm = mics[try3];
          }
          else if (fmatch == try3) minm = std::min(mics[try3], minm);
          if (mics[try4] * 3 < minm && mics[try4] < MI && abs(mics[try4] - minm) >= 30 && fmatch != try4)
            micChange(n, fmatch, try4, dst, prv, src, nxt,
              fmatch, combed, dfrm);
        }
        // micmatching == 3 additionally runs the micmatching == 1 tests.
        if (micmatching == 3) { goto othertest; }
      }
    }
  }
  d2vfilm = d2vduplicate(fmatch, combed, n);
  fileOut(fmatch, combed, d2vfilm, n, mics[fmatch], mics);
  if (display) writeDisplay(dst, n, fmatch, combed, false, blockN[fmatch], xblocks,
    d2vmatch, mics, prv, src, nxt);
//  if (debug)
//  {
//    char buft[20];
//    if (mics[fmatch] < 0) sprintf(buft, "N/A");
//    else sprintf(buft, "%d", mics[fmatch]);
//    sprintf(buf, "TFM:  frame %d  - final match = %c  MIC = %s\n", n, MTC(fmatch), buft);
//    OutputDebugString(buf);
//    if (micout > 0 || (micmatching > 0 && mics[0] != -20 && mics[1] != -20 && mics[2] != -20
//      && mics[3] != -20 && mics[4] != -20))
//    {
//      if (micout > 1 || micmatching > 0)
//        sprintf(buf, "TFM:  frame %d  - mics: p = %d  c = %d  n = %d  b = %d  u = %d\n",
//          n, mics[0], mics[1], mics[2], mics[3], mics[4]);
//      else
//        sprintf(buf, "TFM:  frame %d  - mics: p = %d  c = %d  n = %d\n",
//          n, mics[0], mics[1], mics[2]);
//      OutputDebugString(buf);
//    }
//    sprintf(buf, "TFM:  frame %d  - mode = %d  field = %d  order = %d  d2vfilm = %c\n", n, mode, field, order,
//      d2vfilm ? 'T' : 'F');
//    OutputDebugString(buf);
//    if (combed != -1)
//    {
//      if (combed == 1) sprintf(buf, "TFM:  frame %d  - CLEAN FRAME  (forced!)\n", n);
//      else if (combed == 5) sprintf(buf, "TFM:  frame %d  - COMBED FRAME  (forced!)\n", n);
//      else if (combed == 0) sprintf(buf, "TFM:  frame %d  - CLEAN FRAME\n", n);
//      else sprintf(buf, "TFM:  frame %d  - COMBED FRAME\n", n);
//      OutputDebugString(buf);
//    }
//  }
  if (usehints || PP >= 2) putFrameProperties(dst, fmatch, combed, d2vfilm, mics);
  // Remember this frame's result for the next call.
  lastMatch.frame = n;
  lastMatch.match = fmatch;
  lastMatch.field = field;
  lastMatch.combed = combed;

  vsapi->freeFrame(prv);
  vsapi->freeFrame(src);
  vsapi->freeFrame(nxt);
  vsapi->freeFrame(tmp);
  return dst;
}

void TFM::checkmm(int &cmatch, int m1, int m2, VSFrameRef *dst, int &dfrm, VSFrameRef *tmp, int &tfrm,
  const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int n,
  int *blockN, int &xblocks, int *mics)
{
  if (cmatch != m1)
  {
    int tx = m1;
    m1 = m2;
    m2 = tx;
  }
  if (dfrm == m1)
    checkCombed(dst, n, m1, blockN, xblocks, mics, false);
  else if (tfrm == m1)
    checkCombed(tmp, n, m1, blockN, xblocks, mics, false);
  else
  {
    if (tfrm != m2)
    {
      createWeaveFrame(tmp, prv, src, nxt, m1, tfrm);
      checkCombed(tmp, n, m1, blockN, xblocks, mics, false);
    }
    else
    {
      createWeaveFrame(dst, prv, src, nxt, m1, dfrm);
      checkCombed(dst, n, m1, blockN, xblocks, mics, false);
    }
  }
  if (mics[m1] < 30)
    return;
  if (dfrm == m2)
    checkCombed(dst, n, m2, blockN, xblocks, mics, false);
  else if (tfrm == m2)
    checkCombed(tmp, n, m2, blockN, xblocks, mics, false);
  else
  {
    if (tfrm != m1)
    {
      createWeaveFrame(tmp, prv, src, nxt, m2, tfrm);
      checkCombed(tmp, n, m2, blockN, xblocks, mics, false);
    }
    else
    {
      createWeaveFrame(dst, prv, src, nxt, m2, dfrm);
      checkCombed(dst, n, m2, blockN, xblocks, mics, false);
    }
  }
  if ((mics[m2] * 3 < mics[m1] || (mics[m2] * 2 < mics[m1] && mics[m1] > MI)) &&
    abs(mics[m2] - mics[m1]) >= 30 && mics[m2] < MI)
  {
//    if (debug)
//    {
//      sprintf(buf, "TFM:  frame %d  - micmatching override:  %c (%d) to %c (%d)\n", n,
//        MTC(m1), mics[m1], MTC(m2), mics[m2]);
//      OutputDebugString(buf);
//    }
    cmatch = m2;
  }
}

// Switches the final match to m2: weaves m2 into dst (updating cfrm through
// createWeaveFrame), stores m2 in fmatch and resets combed to 0.
// n and m1 are only referenced by the disabled debug output.
void TFM::micChange(int n, int m1, int m2, VSFrameRef *dst, const VSFrameRef *prv,
  const VSFrameRef *src, const VSFrameRef *nxt, int &fmatch,
  int &combed, int &cfrm) const
{
  (void)n;
  (void)m1;
//  if (debug)
//  {
//    sprintf(buf, "TFM:  frame %d  - micmatching override:  %c to %c\n", n,
//      MTC(m1), MTC(m2));
//    OutputDebugString(buf);
//  }
  createWeaveFrame(dst, prv, src, nxt, m2, cfrm);
  combed = 0;
  fmatch = m2;
}

// Builds the informational text for frame n and stores it in the
// PROP_TFMDisplay frame property of dst. Nothing is drawn here; the property
// is meant to be rendered by a text filter afterwards.
//
//   dst       frame whose properties receive the text
//   n         frame number shown in the text
//   fmatch    final match index (printed through MTC, also indexes mics)
//   combed    -1 = no combing info; 0 = clean; 1 = clean (forced);
//             5 = combed (forced); any other value = combed
//   over      true when the match came from the override path
//   blockN, xblocks  currently unused (box drawing is not implemented)
//   d2vmatch  true when the match came from D2V information
//   mics      the five MIC values; -20 marks a value that was not computed
//   prv, src, nxt    frames handed to checkSceneChange for the "(SC)" tag
//
// Returns without touching dst when combed > 1 and PP > 1, because TFMPP
// produces the display in that case.
void TFM::writeDisplay(VSFrameRef *dst, int n, int fmatch, int combed, bool over,
  int blockN, int xblocks, bool d2vmatch, int *mics, const VSFrameRef *prv,
  const VSFrameRef *src, const VSFrameRef *nxt)
{
  if (combed > 1 && PP > 1) return; // TFMPP will display things instead

  /// TODO: draw the box
  (void)blockN;
  (void)xblocks;
//  if (combed > 1 && PP == 1 && blockN != -20)
//  {
//    drawBox(dst, blockx, blocky, blockN, xblocks, vi);
//  }

  constexpr int kBufSize = 160;
  char buf[kBufSize];

  std::string text = "TFM " VERSION " by tritical\n";

  if (PP > 0)
    snprintf(buf, kBufSize, "order = %d  field = %d  mode = %d  MI = %d\n", order, field, mode, MI);
  else
    snprintf(buf, kBufSize, "order = %d  field = %d  mode = %d\n", order, field, mode);
  text += buf;

  // The scene-change tag is the same for all three variants of the match
  // line, so it is evaluated once (checkSceneChange only runs when one of
  // the scene-change dependent options is active).
  const bool sceneChange =
    (ubsco || mmsco || flags == 5) && checkSceneChange(prv, src, nxt, n);
  const char *scTag = sceneChange ? " (SC) " : "";

  if (!over && !d2vmatch)
    snprintf(buf, kBufSize, "frame: %d  match = %c %s\n", n, MTC(fmatch), scTag);
  else if (d2vmatch)
    snprintf(buf, kBufSize, "frame: %d  match = %c (D2V) %s\n", n, MTC(fmatch), scTag);
  else
    snprintf(buf, kBufSize, "frame: %d  match = %c (OVR) %s\n", n, MTC(fmatch), scTag);
  text += buf;

  const bool firstThreeKnown = mics[0] != -20 && mics[1] != -20 && mics[2] != -20;
  const bool allFiveKnown = firstThreeKnown && mics[3] != -20 && mics[4] != -20;

  if (micout > 0 || (micmatching > 0 && allFiveKnown))
  {
    if (micout == 1 && firstThreeKnown && micmatching == 0)
    {
      snprintf(buf, kBufSize, "MICS:  p = %d  c = %d  n = %d\n", mics[0], mics[1], mics[2]);
      // Fix: this used to be `text + buf;`, which discarded the result, so
      // the three-value MICS line never reached the display.
      text += buf;
    }
    else if ((micout == 2 && allFiveKnown) || micmatching > 0)
    {
      snprintf(buf, kBufSize, "MICS:  p = %d  c = %d  n = %d\n", mics[0], mics[1], mics[2]);
      text += buf;
      snprintf(buf, kBufSize, "       b = %d  u = %d\n", mics[3], mics[4]);
      text += buf;
    }
  }

  if (combed != -1)
  {
    if (combed == 1) snprintf(buf, kBufSize, "PP = %d  CLEAN FRAME (forced!) ", PP);
    else if (combed == 5) snprintf(buf, kBufSize, "PP = %d  COMBED FRAME  (forced!) ", PP);
    else if (combed == 0) snprintf(buf, kBufSize, "PP = %d  CLEAN FRAME ", PP);
    else snprintf(buf, kBufSize, "PP = %d  COMBED FRAME ", PP);
    text += buf;

    if (mics[fmatch] >= 0)
    {
      // Appended to the string directly instead of strcat-ing into buf.
      snprintf(buf, kBufSize, " MIC = %d ", mics[fmatch]);
      text += buf;
    }

    text += "\n";
  }

  if (d2vpercent >= 0.0)
  {
    snprintf(buf, kBufSize, "%3.1f%s FILM (D2V)\n", d2vpercent, "%");
    text += buf;
  }

  VSMap *props = vsapi->getFramePropsRW(dst);
  vsapi->propSetData(props, PROP_TFMDisplay, text.c_str(), text.size(), paReplace);
}

// override from ovr file
// Applies per-frame setting overrides. setArray is read as consecutive groups
// of four values: { setting code, first frame, last frame, value }. Every
// group whose inclusive frame range contains n overwrites the matching
// member; later groups win over earlier ones.
void TFM::getSettingOvr(int n)
{
  const int total = static_cast<int>(setArray.size());
  for (int base = 0; base < total; base += 4)
  {
    if (n < setArray[base + 1] || n > setArray[base + 2])
      continue; // frame n is outside this group's range

    switch (setArray[base])
    {
    case 111: // o
      order = setArray[base + 3];
      break;
    case 109: // m
      mode = setArray[base + 3];
      break;
    case 102: // f
      field = setArray[base + 3];
      break;
    case 80: // P
      PP = setArray[base + 3];
      break;
    case 105: // i
      MI = setArray[base + 3];
      break;
    default:
      break;
    }
  }
}

// Looks up a predetermined match for frame n.
//
// First the override array is consulted (an entry of 255 means "no entry"),
// then, depending on `flags`, the D2V film array. On success `match` (and
// possibly `combed` and the member `field`) are set and true is returned;
// d2vmatch tells whether the match came from the D2V data. Returns false
// when neither source supplies a match. isSC only affects the combed value
// chosen for D2V matches when flags == 5.
bool TFM::getMatchOvr(int n, int &match, int &combed, bool &d2vmatch, bool isSC)
{
  d2vmatch = false;
  bool combedFromOvr = false;

  if (ovrArray.size() && ovrArray[n] != 255)
  {
    const int entry = ovrArray[n];

    // Bit 0x20 clear: the entry carries a forced combed state (bit 0x10).
    if ((entry & 0x00000020) == 0 && PP > 0)
    {
      combed = (entry & 0x00000010) ? 5 : 1;
      combedFromOvr = true;
    }

    // The low three bits hold the match code; it is never negative after
    // masking, so only the upper bound needs checking.
    const int code = entry & 0x00000007;
    if (code <= 6)
    {
      match = code;
      if (field != fieldO)
      {
        // Remap the stored match: 0 <-> 3 and 2 <-> 4.
        switch (match)
        {
        case 0: match = 3; break;
        case 2: match = 4; break;
        case 3: match = 0; break;
        case 4: match = 2; break;
        default: break;
        }
      }
      // Codes 5 and 6 mean match 1, forced combed, with field 0 or 1.
      if (match == 5 || match == 6)
      {
        field = (match == 5) ? 0 : 1;
        combed = 5;
        match = 1;
      }
      return true;
    }
  }

  if (flags == 0 || flags == 3 || !d2vfilmarray.size() ||
    !(d2vfilmarray[n] & D2VARRAY_MATCH_MASK))
    return false;

  const int d2vEntry = d2vfilmarray[n];
  const bool needsBit6 = (flags == 1 || flags == 4 || flags == 5);
  if (needsBit6 && !(d2vEntry & (0x1 << 6)))
    return false;

  const int d2vCode = (d2vEntry & D2VARRAY_MATCH_MASK) >> 2;
  if (d2vCode != 1 && d2vCode != 2)
    return false;

  if (d2vCode == 1)
    match = 1;
  else
    match = (field ^ order) ? 2 : 0;

  // A combed state taken from the override entry is kept as is.
  if (!combedFromOvr)
    combed = (flags == 4 || (flags == 5 && isSC)) ? -1 : 0;

  d2vmatch = true;
  return true;
}

// Decides whether frame n is a duplicate according to the D2V data, based on
// the current match / combed / field and on the result stored in lastMatch.
// lastMatch is reset to -20 when it does not belong to frame n - 1.
// Returns false when there is no D2V data for the frame.
bool TFM::d2vduplicate(int match, int combed, int n)
{
  if (d2vfilmarray.size() == 0 || d2vfilmarray[n] == 0)
    return false;

  if (lastMatch.frame != n - 1)
  {
    lastMatch.match = -20;
    lastMatch.combed = -20;
    lastMatch.frame = -20;
    lastMatch.field = -20;
  }

  // Which field the D2V data flags as the possible duplicate.
  const auto dupBits = d2vfilmarray[n] & D2VARRAY_DUP_MASK;
  int dupField;
  if (dupBits == 0x3) dupField = 1;       // indicates possible top field duplicate
  else if (dupBits == 0x1) dupField = 0;  // indicates possible bottom field duplicate
  else return false;
  const int otherField = 1 - dupField;

  // Condition on the previous frame's result, selected by its field.
  bool previousOk;
  if (lastMatch.field == dupField)
    previousOk = lastMatch.combed > 1 || lastMatch.match != 3;
  else if (lastMatch.field == otherField)
    previousOk = lastMatch.combed < 2 && lastMatch.match != 0;
  else
    return false;
  if (!previousOk)
    return false;

  // Condition on the current frame's result, selected by the current field.
  if (field == dupField)
    return match != 4 || combed > 1;
  if (field == otherField)
    return combed < 2 && match != 2;
  return false;
}

// Records the results for frame n in the output arrays:
//   - moutArray[n] receives MICount (unless it is -1),
//   - moutArrayE receives 3 (micout == 1) or 5 MIC values per frame,
//   - outArray[n] receives a packed hint byte (match code plus FILE_* flags)
//     when an output file name is set.
void TFM::fileOut(int match, int combed, bool d2vfilm, int n, int MICount, int mics[5])
{
  if (moutArray.size() && MICount != -1)
    moutArray[n] = MICount;

  if (micout > 0 && moutArrayE.size())
  {
    const int perFrame = (micout == 1) ? 3 : 5;
    const int base = n * perFrame;
    for (int k = 0; k < perFrame; ++k)
      moutArrayE[base + k] = mics[k];
  }

  if (outArray.size() == 0)
    return;
  if (!output.size() && !outputC.size())
    return;

  if (field != fieldO)
  {
    // Remap the match: 0 <-> 3 and 2 <-> 4.
    switch (match)
    {
    case 0: match = 3; break;
    case 2: match = 4; break;
    case 3: match = 0; break;
    case 4: match = 2; break;
    default: break;
    }
  }

  // A combed match 1 is stored as code 5 (field 0) or 6 (field 1).
  if (match == 1 && combed > 1)
  {
    if (field == 0) match = 5;
    else if (field == 1) match = 6;
  }

  unsigned char hint = 0;
  hint |= match;
  if (combed > 1)
    hint |= FILE_COMBED;
  else if (combed >= 0)
    hint |= FILE_NOTCOMBED;
  if (d2vfilm)
    hint |= FILE_D2V;
  hint |= FILE_ENTRY;

  outArray[n] = hint;
}


// Forwards to checkCombedPlanar. The last argument is true only when the
// format has more than one plane and the `chroma` option is set.
bool TFM::checkCombed(const VSFrameRef *src, int n, int match,
  int *blockN, int &xblocksi, int *mics, bool ddebug)
{
  const bool withChroma = vi->format->numPlanes > 1 && chroma;
  return checkCombedPlanar(src, n, match, blockN, xblocksi, mics, ddebug, withChroma);
}

// Dispatches to the compareFields_core instantiation for the clip's sample
// size: uint8_t for 1 byte per sample, uint16_t otherwise.
int TFM::compareFields(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
  int match2, int& norm1, int& norm2, int& mtn1, int& mtn2, int n)
{
  const bool oneBytePerSample = (vi->format->bytesPerSample == 1);
  return oneBytePerSample
    ? compareFields_core<uint8_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n)
    : compareFields_core<uint16_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n);
}


// Decides which of two candidate field matches (match1, match2) fits the
// current frame better.
//
// For every processed plane a difference map is built between the two
// candidate fields (buildDiffMapPlane2). Wherever the map is non-zero, the
// current field's vertical 1-4-1 weighted sum is compared against 3*(line
// above + line below) taken from each candidate field. Differences above the
// thresholds are accumulated into "normal" (norm) and "motion" (mtn) metrics.
//
// Match values select the source of the candidate field:
//   0 -> prv, 1 -> src, 2 -> nxt (rows starting at field == 1 ? 1 : 2),
//   3 -> prv, 4 -> nxt           (rows starting at field == 1 ? 2 : 1).
//
// Parameters:
//   prv, src, nxt  previous / current / next frame.
//   match1, match2 the two candidates to compare.
//   norm1, norm2   [out] normal metrics for match1 / match2 (8-bit scale).
//   mtn1, mtn2     [out] motion metrics for match1 / match2 (8-bit scale).
//   n              frame number; only referenced by the disabled debug output.
// Returns match1 or match2, whichever has the lower deciding metric
// (match1 on a tie).
template<typename pixel_t>
int TFM::compareFields_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
  int match2, int &norm1, int &norm2, int &mtn1, int &mtn2, int n)
{
    (void)n;

  const int bits_per_pixel = vi->format->bitsPerSample;

  int ret;

  // Luma only for single-plane clips or when mChroma is off, else all 3 planes.
  const int stop = vi->format->numPlanes == 1 || !mChroma ? 1 : 3;
  const int incl = 1;  // pixel increment (always 1 for planar formats)

  // Thresholds are defined on the 8-bit scale and shifted up for deeper formats.
  const int Const23 = 23 << (bits_per_pixel - 8);
  const int Const42 = 42 << (bits_per_pixel - 8);

  uint64_t accumPc = 0, accumNc = 0; // normal metric, match1 / match2
  uint64_t accumPm = 0, accumNm = 0; // motion metric, match1 / match2
  norm1 = norm2 = mtn1 = mtn2 = 0;

  for (int b = 0; b < stop; ++b)
  {
    const int plane = b;

    uint8_t *mapp = vsapi->getWritePtr(map.get(), b);
    int map_pitch = vsapi->getStride(map.get(), b);

    // All pitches below are expressed in pixels, not bytes.
    const pixel_t* prvp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(prv, plane));
    const int prv_pitch = vsapi->getStride(prv, plane) / sizeof(pixel_t);

    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t);

    const int Width = vsapi->getFrameWidth(src, plane);
    const int Height = vsapi->getFrameHeight(src, plane);

    const pixel_t* nxtp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(nxt, plane));
    const int nxt_pitch = vsapi->getStride(nxt, plane) / sizeof(pixel_t);

    // Skip an 8 luma-pixel wide margin on the left and right edges.
    const int startx = 8 >> (plane ? vi->format->subSamplingW : 0);
    const int stopx = Width - startx;

    const pixel_t* prvpf = nullptr, * curf = nullptr, * nxtpf = nullptr;
    int prvf_pitch = 0, nxtf_pitch = 0;
    const int curf_pitch = src_pitch << 1;

    // Exclusion band limits from the y0/y1 parameters, scaled for chroma planes.
    const int ysubsampling = (plane ? vi->format->subSamplingH : 0);
    int y0a = y0 >> ysubsampling;
    int y1a = y1 >> ysubsampling;
    // Equal limits mean "no band"; this is decided before the widening below.
    const bool noBandExclusion = (y0a == y1a);
    if (y0a >= 2) y0a = y0a - 2; // v18: real limit, since y goes only till Height-2
    if (y1a <= Height - 2) y1a = y1a + 2; // v18: real limit, since y goes only from 2

    // --- candidate 1: current-field row, map row and first candidate field ---
    if (match1 < 3)
    {
      curf = srcp + ((3 - field)*src_pitch);
      mapp = mapp + ((field == 1 ? 1 : 2)*map_pitch);
    }
    if (match1 == 0)
    {
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match1 == 1)
    {
      prvf_pitch = src_pitch << 1;
      prvpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match1 == 2)
    {
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match1 == 3)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }
    else if (match1 == 4)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }

    // --- candidate 2: second candidate field ---
    if (match2 == 0)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match2 == 1)
    {
      nxtf_pitch = src_pitch << 1;
      nxtpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match2 == 2)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match2 == 3)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
    }
    else if (match2 == 4)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
    }

    // Neighbouring lines within each field (one field line = two frame lines).
    const pixel_t* prvnf = prvpf + prvf_pitch;
    const pixel_t* curpf = curf - curf_pitch;
    const pixel_t* curnf = curf + curf_pitch;
    const pixel_t* nxtnf = nxtpf + nxtf_pitch;

    map_pitch <<= 1;
    uint8_t* mapn = mapp + map_pitch;

    // Build the difference map between the two candidate fields.
    // buildDiffMapPlane2 takes byte pointers and byte pitches.
    if ((match1 >= 3 && field == 1) || (match1 < 3 && field != 1))
      buildDiffMapPlane2<pixel_t>(
        reinterpret_cast<const uint8_t*>(prvpf - prvf_pitch),
        reinterpret_cast<const uint8_t*>(nxtpf - nxtf_pitch),
        mapp - map_pitch,
        prvf_pitch * sizeof(pixel_t),
        nxtf_pitch * sizeof(pixel_t),
        map_pitch, Height >> 1, Width, bits_per_pixel);
    else
      buildDiffMapPlane2<pixel_t>(
        reinterpret_cast<const uint8_t*>(prvnf - prvf_pitch),
        reinterpret_cast<const uint8_t*>(nxtnf - nxtf_pitch),
        mapn - map_pitch,
        prvf_pitch * sizeof(pixel_t),
        nxtf_pitch * sizeof(pixel_t),
        map_pitch, Height >> 1, Width, bits_per_pixel);

    // Accumulate the metrics over every second row.
    for (int y = 2; y < Height - 2; y += 2) {
      if ((y < y0a) || noBandExclusion || (y > y1a))  // exclusion area check
      {
        for (int x = startx; x < stopx; x += incl)
        {
          // Combine both map rows; only the low 8 bits decide whether the
          // pixel is examined at all.
          int eax = (mapp[x] << 2) + mapn[x];
          if ((eax & 0xFF) == 0)
            continue;

          // Both sums carry a total weight of 6, hence the later division by 6.
          int a_curr = curpf[x] + (curf[x] << 2) + curnf[x];
          int a_prev = 3 * (prvpf[x] + prvnf[x]);
          int diff_p_c = abs(a_prev - a_curr);
          if (diff_p_c > Const23) {
            accumPc += diff_p_c;
            // Mask 10 tests bit 1 and bit 3 of the combined map value.
            if (diff_p_c > Const42 && ((eax & 10) != 0))
              accumPm += diff_p_c;
          }
          int a_next = 3 * (nxtpf[x] + nxtnf[x]);
          int diff_n_c = abs(a_next - a_curr);
          if (diff_n_c > Const23) {
            accumNc += diff_n_c;
            if (diff_n_c > Const42 && ((eax & 10) != 0))
              accumNm += diff_n_c;
          }
        }
      } // if

      // Advance every row pointer by one field line.
      mapp += map_pitch;
      prvpf += prvf_pitch;
      curpf += curf_pitch;
      prvnf += prvf_pitch;
      curf += curf_pitch;
      nxtpf += nxtf_pitch;
      curnf += curf_pitch;
      nxtnf += nxtf_pitch;
      mapn += map_pitch;
    }
  }

  // High bit depth: I chose to scale back to 8 bit range.
  // Or else we should treat them as int64 and act upon them outside
  const double factor = 1.0 / (1 << (bits_per_pixel - 8));

  norm1 = (int)((accumPc / 6.0 * factor) + 0.5);
  norm2 = (int)((accumNc / 6.0 * factor) + 0.5);
  mtn1 = (int)((accumPm / 6.0 * factor) + 0.5);
  mtn2 = (int)((accumNm / 6.0 * factor) + 0.5);
  // TODO:  improve this decision about whether to use the mtn metrics or
  //        the normal metrics.  mtn metrics give better recognition of
  //        small areas ("mouths")... the hard part is telling when they
  //        are reliable enough to use.
  float c1 = float(std::max(norm1, norm2)) / float(std::max(std::min(norm1, norm2), 1));
  float c2 = float(std::max(mtn1, mtn2)) / float(std::max(std::min(mtn1, mtn2), 1));
  float mr = float(std::max(mtn1, mtn2)) / float(std::max(std::max(norm1, norm2), 1));
  // Use the motion metrics when they are large and clearly lopsided;
  // the required ratio relaxes as the magnitude grows.
  if (((mtn1 >= 500 || mtn2 >= 500) && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1)) ||
    ((mtn1 >= 1000 || mtn2 >= 1000) && (mtn1 * 3 < mtn2 * 2 || mtn2 * 3 < mtn1 * 2)) ||
    ((mtn1 >= 2000 || mtn2 >= 2000) && (mtn1 * 5 < mtn2 * 4 || mtn2 * 5 < mtn1 * 4)) ||
    ((mtn1 >= 4000 || mtn2 >= 4000) && c2 > c1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  // Smaller motion metrics still decide when they are a noticeable share of
  // the normal metric and differ by more than a factor of two.
  else if (mr > 0.005 && std::max(mtn1, mtn2) > 150 && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  // Otherwise fall back to the normal metrics.
  else
  {
    if (norm1 > norm2) ret = match2;
    else ret = match1;
  }
//  if (debug)
//  {
//    sprintf(buf, "TFM:  frame %d  - comparing %c to %c\n", n, MTC(match1), MTC(match2));
//    OutputDebugString(buf);
//    sprintf(buf, "TFM:  frame %d  - nmatches:  %d vs %d (%3.1f)  mmatches:  %d vs %d (%3.1f)\n", n,
//      norm1, norm2, c1, mtn1, mtn2, c2);
//    OutputDebugString(buf);
//  }
  return ret;
}

// Dispatches to the slow field comparison selected by the `slow` member
// (2 -> compareFieldsSlow2_core, anything else -> compareFieldsSlow_core),
// instantiated for uint8_t when samples are 1 byte wide and uint16_t otherwise.
int TFM::compareFieldsSlow(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
  int match2, int& norm1, int& norm2, int& mtn1, int& mtn2, int n)
{
  const bool oneBytePerSample = (vi->format->bytesPerSample == 1);

  if (slow == 2)
  {
    return oneBytePerSample
      ? compareFieldsSlow2_core<uint8_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n)
      : compareFieldsSlow2_core<uint16_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n);
  }

  return oneBytePerSample
    ? compareFieldsSlow_core<uint8_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n)
    : compareFieldsSlow_core<uint16_t>(prv, src, nxt, match1, match2, norm1, norm2, mtn1, mtn2, n);
}

// "Slow" variant of the field comparison: decides which of two candidate
// field matches (match1, match2) fits the current frame better.
//
// Compared with compareFields_core:
//   - the difference map is cleared and rebuilt with buildDiffMapPlane_Planar
//     using the tpitchy / tpitchuv members,
//   - the two map rows are combined with a shift of 3 instead of 2,
//   - the normal metric is gated by map bits (mask 9),
//   - a third pair of accumulators (accumPml / accumNml, mask 36) can replace
//     the motion metric when that one is too small to be useful.
//
// Match values select the source of the candidate field:
//   0 -> prv, 1 -> src, 2 -> nxt (rows starting at field == 1 ? 1 : 2),
//   3 -> prv, 4 -> nxt           (rows starting at field == 1 ? 2 : 1).
//
// Parameters:
//   prv, src, nxt  previous / current / next frame.
//   match1, match2 the two candidates to compare.
//   norm1, norm2   [out] normal metrics for match1 / match2 (8-bit scale).
//   mtn1, mtn2     [out] motion metrics for match1 / match2 (8-bit scale).
//   n              frame number; only referenced by the disabled debug output.
// Returns match1 or match2, whichever has the lower deciding metric
// (match1 on a tie).
//
// Note: the metric loop used to be guarded by USE_C_NO_ASM with an x86
// inline-asm alternative; the C implementation is now unconditional.
template<typename pixel_t>
int TFM::compareFieldsSlow_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
  int match2, int &norm1, int &norm2, int &mtn1, int &mtn2, int n)
{
    (void)n;

  const int bits_per_pixel = vi->format->bitsPerSample;

  int ret;

  // Luma only for single-plane clips or when mChroma is off, else all 3 planes.
  const int stop = vi->format->numPlanes == 1 || !mChroma ? 1 : 3;
  const int incl = 1;  // pixel increment (always 1 for planar formats)

  // Thresholds are defined on the 8-bit scale and shifted up for deeper formats.
  const int Const23 = 23 << (bits_per_pixel - 8);
  const int Const42 = 42 << (bits_per_pixel - 8);

  uint64_t accumPc = 0, accumNc = 0;   // normal metric, match1 / match2
  uint64_t accumPm = 0, accumNm = 0;   // motion metric, match1 / match2
  uint64_t accumPml = 0, accumNml = 0; // plus compared to CompareFields
  norm1 = norm2 = mtn1 = mtn2 = 0;

  for (int b = 0; b < stop; ++b)
  {
    const int plane = b;

    uint8_t* mapp = vsapi->getWritePtr(map.get(), b);
    int map_pitch = vsapi->getStride(map.get(), b);

    // All pitches below are expressed in pixels, not bytes.
    const pixel_t* prvp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(prv, plane));
    const int prv_pitch = vsapi->getStride(prv, plane) / sizeof(pixel_t);

    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t);

    const int Width = vsapi->getFrameWidth(src, plane);
    const int Height = vsapi->getFrameHeight(src, plane);

    const pixel_t* nxtp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(nxt, plane));
    const int nxt_pitch = vsapi->getStride(nxt, plane) / sizeof(pixel_t);

    // Skip an 8 luma-pixel wide margin on the left and right edges.
    const int startx = 8 >> (plane ? vi->format->subSamplingW : 0);
    const int stopx = Width - startx;

    const pixel_t* prvpf = nullptr, * curf = nullptr, * nxtpf = nullptr;
    int prvf_pitch = 0, nxtf_pitch = 0;
    const int curf_pitch = src_pitch << 1;

    // Clear the whole map plane; the size is computed in size_t.
    memset(mapp, 0, static_cast<size_t>(Height) * map_pitch);

    // Exclusion band limits from the y0/y1 parameters, scaled for chroma
    // planes, and the per-plane tpitch (plus compared to simple compareFields).
    const int ysubsampling = (plane ? vi->format->subSamplingH : 0);
    int y0a = y0 >> ysubsampling;
    int y1a = y1 >> ysubsampling;
    const int tpitch_current = (b == 0) ? tpitchy : tpitchuv;
    // Equal limits mean "no band"; this is decided before the widening below.
    const bool noBandExclusion = (y0a == y1a);
    if (y0a >= 2) y0a = y0a - 2; // v18: real limit, since y goes only till Height-2
    if (y1a <= Height - 2) y1a = y1a + 2; // v18: real limit, since y goes only from 2

    // --- candidate 1: current-field row, map row and first candidate field ---
    if (match1 < 3)
    {
      curf = srcp + ((3 - field)*src_pitch);
      mapp = mapp + ((field == 1 ? 1 : 2)*map_pitch);
    }
    if (match1 == 0)
    {
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match1 == 1)
    {
      prvf_pitch = src_pitch << 1;
      prvpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match1 == 2)
    {
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match1 == 3)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }
    else if (match1 == 4)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }

    // --- candidate 2: second candidate field ---
    if (match2 == 0)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match2 == 1)
    {
      nxtf_pitch = src_pitch << 1;
      nxtpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match2 == 2)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match2 == 3)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
    }
    else if (match2 == 4)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
    }

    // Neighbouring lines within each field (one field line = two frame lines).
    const pixel_t* prvnf = prvpf + prvf_pitch;
    const pixel_t* curpf = curf - curf_pitch;
    const pixel_t* curnf = curf + curf_pitch;
    const pixel_t* nxtnf = nxtpf + nxtf_pitch;

    map_pitch <<= 1;
    uint8_t* mapn = mapp + map_pitch;

    // Build the difference map between the two candidate fields.
    // buildDiffMapPlane_Planar takes byte pointers and byte pitches.
    if ((match1 >= 3 && field == 1) || (match1 < 3 && field != 1))
      buildDiffMapPlane_Planar<pixel_t>(
        reinterpret_cast<const uint8_t*>(prvpf),
        reinterpret_cast<const uint8_t*>(nxtpf),
        mapp,
        prvf_pitch * sizeof(pixel_t),
        nxtf_pitch * sizeof(pixel_t),
        map_pitch, Height, Width, tpitch_current, bits_per_pixel);
    else
      buildDiffMapPlane_Planar<pixel_t>(
        reinterpret_cast<const uint8_t*>(prvnf),
        reinterpret_cast<const uint8_t*>(nxtnf),
        mapn,
        prvf_pitch * sizeof(pixel_t),
        nxtf_pitch * sizeof(pixel_t),
        map_pitch, Height, Width, tpitch_current, bits_per_pixel);

    // TFM 1144
    // almost the same as in compareFields and buildDiffMapPlane2
    for (int y = 2; y < Height - 2; y += 2) {
      if ((y < y0a) || noBandExclusion || (y > y1a)) // exclusion area check
      {
        for (int x = startx; x < stopx; x += incl)
        {
          // Combine both map rows (shift by 3 here, 2 in compareFields);
          // only the low 8 bits decide whether the pixel is examined at all.
          int eax = (mapp[x] << 3) + mapn[x];
          if ((eax & 0xFF) == 0)
            continue;

          // Both sums carry a total weight of 6, hence the later division by 6.
          int a_curr = curpf[x] + (curf[x] << 2) + curnf[x];
          int a_prev = 3 * (prvpf[x] + prvnf[x]);
          int diff_p_c = abs(a_prev - a_curr);
          if (diff_p_c > Const23) {
            if ((eax & 9) != 0) // bits 0 and 3: gates the normal metric
              accumPc += diff_p_c;
            if (diff_p_c > Const42) {
              if ((eax & 18) != 0) // bits 1 and 4: motion metric
                accumPm += diff_p_c;
              if ((eax & 36) != 0) // bits 2 and 5: alternate motion metric
                accumPml += diff_p_c;
            }
          }
          int a_next = 3 * (nxtpf[x] + nxtnf[x]);
          int diff_n_c = abs(a_next - a_curr);
          if (diff_n_c > Const23) {
            if ((eax & 9) != 0) // bits 0 and 3: gates the normal metric
              accumNc += diff_n_c;
            if (diff_n_c > Const42) {
              if ((eax & 18) != 0) // bits 1 and 4: motion metric
                accumNm += diff_n_c;
              if ((eax & 36) != 0) // bits 2 and 5: alternate motion metric
                accumNml += diff_n_c;
            }
          }
        }
      } // if

      // Advance every row pointer by one field line.
      mapp += map_pitch;
      prvpf += prvf_pitch;
      curpf += curf_pitch;
      prvnf += prvf_pitch;
      curf += curf_pitch;
      nxtpf += nxtf_pitch;
      curnf += curf_pitch;
      nxtnf += nxtf_pitch;
      mapn += map_pitch;
    }
  }

  // When both motion metrics are below the (bit-depth scaled) 500 threshold
  // but the alternate ones are not and differ by more than a factor of 3,
  // use the alternate accumulators as the motion metric.
  const unsigned int Const500 = 500 << (bits_per_pixel - 8);
  if (accumPm < Const500 && accumNm < Const500 && (accumPml >= Const500 || accumNml >= Const500) &&
    std::max(accumPml, accumNml) > 3 * std::min(accumPml, accumNml))
  {
    accumPm = accumPml;
    accumNm = accumNml;
  }

  // High bit depth: I chose to scale back to 8 bit range.
  // Or else we should treat them as int64 and act upon them outside
  const double factor = 1.0 / (1 << (bits_per_pixel - 8));

  norm1 = (int)((accumPc / 6.0 * factor) + 0.5);
  norm2 = (int)((accumNc / 6.0 * factor) + 0.5);
  mtn1 = (int)((accumPm / 6.0 * factor) + 0.5);
  mtn2 = (int)((accumNm / 6.0 * factor) + 0.5);
  // we are in the 8bit normalized region again, no change from here
  float c1 = float(std::max(norm1, norm2)) / float(std::max(std::min(norm1, norm2), 1));
  float c2 = float(std::max(mtn1, mtn2)) / float(std::max(std::min(mtn1, mtn2), 1));
  float mr = float(std::max(mtn1, mtn2)) / float(std::max(std::max(norm1, norm2), 1));
  // Use the motion metrics when they are large and clearly lopsided;
  // the required ratio relaxes as the magnitude grows.
  if (((mtn1 >= 375 || mtn2 >= 375) && (mtn1 * 3 < mtn2 * 1 || mtn2 * 3 < mtn1 * 1)) ||
    ((mtn1 >= 500 || mtn2 >= 500) && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1)) ||
    ((mtn1 >= 1000 || mtn2 >= 1000) && (mtn1 * 3 < mtn2 * 2 || mtn2 * 3 < mtn1 * 2)) ||
    ((mtn1 >= 2000 || mtn2 >= 2000) && (mtn1 * 5 < mtn2 * 4 || mtn2 * 5 < mtn1 * 4)) ||
    ((mtn1 >= 4000 || mtn2 >= 4000) && c2 > c1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  // Smaller motion metrics still decide when they are a noticeable share of
  // the normal metric and differ by more than a factor of two.
  else if (mr > 0.005 && std::max(mtn1, mtn2) > 150 && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  // Otherwise fall back to the normal metrics.
  else
  {
    if (norm1 > norm2) ret = match2;
    else ret = match1;
  }
//  if (debug)
//  {
//    sprintf(buf, "TFM:  frame %d  - comparing %c to %c  (SLOW 1)\n", n, MTC(match1), MTC(match2));
//    OutputDebugString(buf);
//    sprintf(buf, "TFM:  frame %d  - nmatches:  %d vs %d (%3.1f)  mmatches:  %d vs %d (%3.1f)\n", n,
//      norm1, norm2, c1, mtn1, mtn2, c2);
//    OutputDebugString(buf);
//  }
  return ret;
}

template<typename pixel_t>
int TFM::compareFieldsSlow2_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
  int match2, int &norm1, int &norm2, int &mtn1, int &mtn2, int n)
{
    (void)n;

  const int bits_per_pixel = vi->format->bitsPerSample;

  int ret;
  int y0a, y1a;  // exclusion regio

  int tpitch_current;

  const int stop = vi->format->numPlanes == 1 || !mChroma ? 1 : 3;
  int incl = 1;  // pixel increments: 2 if YUY2 with no-chroma option otherwise 1

  uint64_t accumPc = 0, accumNc = 0;
  uint64_t accumPm = 0, accumNm = 0;
  uint64_t accumPml = 0, accumNml = 0; // plus compared to CompareFields
  norm1 = norm2 = mtn1 = mtn2 = 0;
  
  for (int b = 0; b < stop; ++b)
  {
    const int plane = b;
    uint8_t* mapp = vsapi->getWritePtr(map.get(), b);
    int map_pitch = vsapi->getStride(map.get(), b);

    const pixel_t* prvp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(prv, plane));
    const int prv_pitch = vsapi->getStride(prv, plane) / sizeof(pixel_t);

    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t);

    const int Width = vsapi->getFrameWidth(src, plane);
    const int Height = vsapi->getFrameHeight(src, plane);

    const pixel_t* nxtp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(nxt, plane));
    const int nxt_pitch = vsapi->getStride(nxt, plane) / sizeof(pixel_t);

    const int startx = 8 >> (plane ? vi->format->subSamplingW : 0);
    const int stopx = Width - startx;

    const pixel_t* prvpf = nullptr, * curf = nullptr, * nxtpf = nullptr;
    int prvf_pitch = 0, curf_pitch, nxtf_pitch = 0;

    curf_pitch = src_pitch << 1;

    memset(mapp, 0, Height * map_pitch);

    // exclusion area limits from parameters
    if (b == 0)
    {
      y0a = y0;
      y1a = y1;
      tpitch_current = tpitchy;
    }
    else 
    { 
      const int ysubsampling = vi->format->subSamplingH;
      y0a = y0 >> ysubsampling;
      y1a = y1 >> ysubsampling;
      tpitch_current = tpitchuv;
    }
    const bool noBandExclusion = (y0a == y1a);
    if (y0a >= 2) y0a = y0a - 2; // v18: real limit, since y goes only till Height-2
    if (y1a <= Height - 2) y1a = y1a + 2; // v18: real limit, since y goes only from 2

    if (match1 < 3)
    {
      curf = srcp + ((3 - field)*src_pitch);
      mapp = mapp + ((field == 1 ? 1 : 2)*map_pitch);
    }
    if (match1 == 0)
    {
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match1 == 1)
    {
      prvf_pitch = src_pitch << 1;
      prvpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match1 == 2)
    {
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match1 == 3)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = prv_pitch << 1;
      prvpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }
    else if (match1 == 4)
    {
      curf = srcp + ((2 + field)*src_pitch);
      prvf_pitch = nxt_pitch << 1;
      prvpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
      mapp = mapp + ((field == 1 ? 2 : 1)*map_pitch);
    }
    if (match2 == 0)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 1 : 2)*prv_pitch);
    }
    else if (match2 == 1)
    {
      nxtf_pitch = src_pitch << 1;
      nxtpf = srcp + ((field == 1 ? 1 : 2)*src_pitch);
    }
    else if (match2 == 2)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 1 : 2)*nxt_pitch);
    }
    else if (match2 == 3)
    {
      nxtf_pitch = prv_pitch << 1;
      nxtpf = prvp + ((field == 1 ? 2 : 1)*prv_pitch);
    }
    else if (match2 == 4)
    {
      nxtf_pitch = nxt_pitch << 1;
      nxtpf = nxtp + ((field == 1 ? 2 : 1)*nxt_pitch);
    }

    const pixel_t* prvppf = prvpf - prvf_pitch;
    const pixel_t* prvnf = prvpf + prvf_pitch;
    const pixel_t* prvnnf = prvnf + prvf_pitch;

    const pixel_t* curpf = curf - curf_pitch;
    const pixel_t* curnf = curf + curf_pitch;

    const pixel_t* nxtppf = nxtpf - nxtf_pitch;
    const pixel_t* nxtnf = nxtpf + nxtf_pitch;
    const pixel_t* nxtnnf = nxtnf + nxtf_pitch;

    map_pitch <<= 1;
    uint8_t* mapn = mapp + map_pitch;

    // back to byte pointers
      if ((match1 >= 3 && field == 1) || (match1 < 3 && field != 1))
        buildDiffMapPlane_Planar<pixel_t>(
          reinterpret_cast<const uint8_t*>(prvpf),
          reinterpret_cast<const uint8_t*>(nxtpf),
          mapp,
          prvf_pitch * sizeof(pixel_t),
          nxtf_pitch * sizeof(pixel_t),
          map_pitch, Height, Width, tpitch_current, bits_per_pixel);
      else
        buildDiffMapPlane_Planar<pixel_t>(
          reinterpret_cast<const uint8_t*>(prvnf),
          reinterpret_cast<const uint8_t*>(nxtnf),
          mapn,
          prvf_pitch * sizeof(pixel_t),
          nxtf_pitch * sizeof(pixel_t),
          map_pitch, Height, Width, tpitch_current, bits_per_pixel);

    const int Const23 = 23 << (bits_per_pixel - 8);
    const int Const42 = 42 << (bits_per_pixel - 8);

    if (field == 0) {
    // TFM 1436
    // almost the same as in TFM 1144
      for (int y = 2; y < Height - 2; y += 2) {
        if ((y < y0a) || noBandExclusion || (y > y1a))
        {
          for (int x = startx; x < stopx; x += incl)
          {
            int eax = (mapp[x] << 3) + mapn[x]; // diff from prev asm block (at buildDiffMapPlane2): <<3 instead of <<2
            if ((eax & 0xFF) == 0)
              continue;

            int a_curr = curpf[x] + (curf[x] << 2) + curnf[x];
            int a_prev = 3 * (prvpf[x] + prvnf[x]);
            int diff_p_c = abs(a_prev - a_curr);
            if (diff_p_c > Const23) {
              if ((eax & 9) != 0) // diff from previous similar asm block: condition
                accumPc += diff_p_c;
              if (diff_p_c > Const42) {
                if ((eax & 18) != 0) // diff: &18 instead of &10
                  accumPm += diff_p_c;
                if ((eax & 36) != 0) // diff: new condition and accumulator
                  accumPml += diff_p_c;
              }
            }
            int a_next = 3 * (nxtpf[x] + nxtnf[x]);
            int diff_n_c = abs(a_next - a_curr);
            if (diff_n_c > Const23) {
              if ((eax & 9) != 0) // diff from previous similar asm block: condition
                accumNc += diff_n_c;
              if (diff_n_c > Const42) {
                if ((eax & 18) != 0) // diff: &18 instead of &10
                  accumNm += diff_n_c;
                if ((eax & 36) != 0) // diff: &18 instead of &10
                  accumNml += diff_n_c;
              }
            }

            // additional difference from TFM 1144
            if ((eax & 56) != 0) {

              a_prev = prvppf[x] + (prvpf[x] << 2) + prvnf[x];
              a_curr = 3 * (curpf[x] + curf[x]);
              diff_p_c = abs(a_prev - a_curr);
              if (diff_p_c > Const23) {
                if ((eax & 8) != 0) // diff from previous similar asm block: condition
                  accumPc += diff_p_c;
                if (diff_p_c > Const42) {
                  if ((eax & 16) != 0) // diff: &16 instead of &18
                    accumPm += diff_p_c;
                  if ((eax & 32) != 0) // diff: new condition and accumulator
                    accumPml += diff_p_c;
                }
              }
              a_next = nxtppf[x] + (nxtpf[x] << 2) + nxtnf[x]; // really! not 3*
              diff_n_c = abs(a_next - a_curr);
              if (diff_n_c > Const23) {
                if ((eax & 8) != 0) // diff: &8 instead of &9
                  accumNc += diff_n_c;
                if (diff_n_c > Const42) {
                  if ((eax & 16) != 0) // diff: &16 instead of &18
                    accumNm += diff_n_c;
                  if ((eax & 32) != 0) // diff: &32 instead of &36
                    accumNml += diff_n_c;
                }
              }
            }
          }
        } // if

        mapp += map_pitch;
        prvpf += prvf_pitch;
        curpf += curf_pitch;
        prvnf += prvf_pitch;
        curf += curf_pitch;
        nxtpf += nxtf_pitch;
        curnf += curf_pitch;
        nxtnf += nxtf_pitch;
        mapn += map_pitch;

        prvppf += prvf_pitch;
        nxtppf += nxtf_pitch;
      }
    }
    else {
      // TFM 1633
      // almost the same as in TFM 1436 (field==0= case)
      // differences are after eax&56 block, see later

      for (int y = 2; y < Height - 2; y += 2) {
        if ((y < y0a) || noBandExclusion || (y > y1a))
        {
          for (int x = startx; x < stopx; x += incl)
          {
            int eax = (mapp[x] << 3) + mapn[x]; // diff from prev asm block (at buildDiffMapPlane2): <<3 instead of <<2
            if ((eax & 0xFF) == 0)
              continue;

            int a_curr = curpf[x] + (curf[x] << 2) + curnf[x];
            int a_prev = 3 * (prvpf[x] + prvnf[x]);
            int diff_p_c = abs(a_prev - a_curr);
            if (diff_p_c > Const23) {
              if ((eax & 9) != 0) // diff from previous similar asm block: condition
                accumPc += diff_p_c;
              if (diff_p_c > Const42) {
                if ((eax & 18) != 0) // diff: &18 instead of &10
                  accumPm += diff_p_c;
                if ((eax & 36) != 0) // diff: new condition and accumulator
                  accumPml += diff_p_c;
              }
            }
            int a_next = 3 * (nxtpf[x] + nxtnf[x]); // L2008
            int diff_n_c = abs(a_next - a_curr);
            if (diff_n_c > Const23) {
              if ((eax & 9) != 0) // diff from previous similar asm block: condition
                accumNc += diff_n_c;
              if (diff_n_c > Const42) {
                if ((eax & 18) != 0) // diff: &18 instead of &10
                  accumNm += diff_n_c;
                if ((eax & 36) != 0) // diff: &18 instead of &10
                  accumNml += diff_n_c;
              }
            }

            // difference from TFM 1436
            // prvpf  -> prvnf
            // prvppf -> prvpf
            // prvnf  -> prvnnf
            // curpf  -> curf
            // curf   -> curnf
            // nxtpf  -> nxtnf
            // nxtppf -> nxtpf
            // nxtnf  -> nxtnnf
            // mask 8/16/32 -> 1/2/4
            if ((eax & 7) != 0) { // 1.0.12: diff: &7 instead of &56 L2036

              a_prev = prvpf[x] + (prvnf[x] << 2) + prvnnf[x];
              a_curr = 3 * (curf[x] + curnf[x]);
              diff_p_c = abs(a_prev - a_curr);
              if (diff_p_c > Const23) {
                if ((eax & 1) != 0) // diff: &1 instead of &8
                  accumPc += diff_p_c;
                if (diff_p_c > Const42) {
                  if ((eax & 2) != 0) // diff: &2 instead of &16
                    accumPm += diff_p_c;
                  if ((eax & 4) != 0) // diff: &4 instead of &32
                    accumPml += diff_p_c;
                }
              }
              //int a_next = *(nxtppf + ebx) + (*(nxtpf + ebx) << 2) + *(nxtnf + ebx); // really! not 3*
              a_next = nxtpf[x] + (nxtnf[x] << 2) + nxtnnf[x]; // really! not 3* L2075
              diff_n_c = abs(a_next - a_curr);
              if (diff_n_c > Const23) { // L2088
                if ((eax & 1) != 0) // diff: &1 instead of &8
                  accumNc += diff_n_c;
                if (diff_n_c > Const42) { // L2094
                  if ((eax & 2) != 0) // diff: &2 instead of &16 // 1.0.12 really 2
                    accumNm += diff_n_c;
                  if ((eax & 4) != 0) // diff: &4 instead of &32
                    accumNml += diff_n_c;
                }
              }
            }
          }
        } // if

        mapp += map_pitch;
        prvpf += prvf_pitch;
        curpf += curf_pitch;
        prvnf += prvf_pitch;
        curf += curf_pitch;
        prvnnf += prvf_pitch; // 1.0.12
        nxtpf += nxtf_pitch;
        curnf += curf_pitch;
        nxtnf += nxtf_pitch;
        nxtnnf += nxtf_pitch;
        mapn += map_pitch;

        // not used prvppf += prvf_pitch;
        // not used nxtppf += nxtf_pitch;

      }

    }

#if 0
    if (field == 0)
    {
      // TFM 1436
      __asm
      {
        push ebx // pf170421

        mov y, 2
        yloop0:
        mov ecx, y0a
          mov edx, y1a
          cmp ecx, edx
          je xloop_pre0
          mov eax, y
          cmp eax, ecx
          jl xloop_pre0
          cmp eax, edx
          jle end_yloop0
          xloop_pre0 :
        mov esi, incl
          mov ebx, startx
          mov edi, mapp
          mov edx, mapn
          mov ecx, stopx
          xloop0 :
        movzx eax, BYTE PTR[edi + ebx]
          shl eax, 3
          add al, BYTE PTR[edx + ebx]
          jnz b10
          add ebx, esi
          cmp ebx, ecx
          jl xloop0
          jmp end_yloop0
          b10 :
        mov edx, curf
          mov edi, curpf
          movzx ecx, BYTE PTR[edx + ebx]
          movzx esi, BYTE PTR[edi + ebx]
          shl ecx, 2
          mov edx, curnf
          add ecx, esi
          mov edi, prvpf
          movzx esi, BYTE PTR[edx + ebx]
          movzx edx, BYTE PTR[edi + ebx]
          add ecx, esi
          mov edi, prvnf
          movzx esi, BYTE PTR[edi + ebx]
          add edx, esi
          mov edi, edx
          add edx, edx
          sub edi, ecx
          add edx, edi
          jge b30
          neg edx
          b30 :
        cmp edx, 23
          jle p30
          test eax, 9
          jz p10
          add accumPc, edx
          p10 :
        cmp edx, 42
          jle p30
          test eax, 18
          jz p20
          add accumPm, edx
          p20 :
        test eax, 36
          jz p30
          add accumPml, edx
          p30 :
        mov edi, nxtpf
          mov esi, nxtnf
          movzx edx, BYTE PTR[edi + ebx]
          movzx edi, BYTE PTR[esi + ebx]
          add edx, edi
          mov esi, edx
          add edx, edx
          sub esi, ecx
          add edx, esi
          jge b20
          neg edx
          b20 :
        cmp edx, 23
          jle p60
          test eax, 9
          jz p40
          add accumNc, edx
          p40 :
        cmp edx, 42
          jle p60
          test eax, 18
          jz p50
          add accumNm, edx
          p50 :
        test eax, 36
          jz p60
          add accumNml, edx
          p60 :
        test eax, 56
          jz p120
          mov ecx, prvpf
          mov edi, prvppf
          movzx edx, BYTE PTR[ecx + ebx]
          movzx esi, BYTE PTR[edi + ebx]
          shl edx, 2
          mov ecx, prvnf
          add edx, esi
          mov edi, curpf
          movzx esi, BYTE PTR[ecx + ebx]
          movzx ecx, BYTE PTR[edi + ebx]
          add edx, esi
          mov edi, curf
          movzx esi, BYTE PTR[edi + ebx]
          add ecx, esi
          mov edi, ecx
          add ecx, ecx
          add ecx, edi
          sub edx, ecx
          jge b40
          neg edx
          b40 :
        cmp edx, 23
          jle p90
          test eax, 8
          jz p70
          add accumPc, edx
          p70 :
        cmp edx, 42
          jle p90
          test eax, 16
          jz p80
          add accumPm, edx
          p80 :
        test eax, 32
          jz p90
          add accumPml, edx
          p90 :
        mov edi, nxtpf
          mov esi, nxtppf
          movzx edx, BYTE PTR[edi + ebx]
          movzx edi, BYTE PTR[esi + ebx]
          shl edx, 2
          mov esi, nxtnf
          add edx, edi
          movzx edi, BYTE PTR[esi + ebx]
          add edx, edi
          sub edx, ecx
          jge b50
          neg edx
          b50 :
        cmp edx, 23
          jle p120
          test eax, 8
          jz p100
          add accumNc, edx
          p100 :
        cmp edx, 42
          jle p120
          test eax, 16
          jz p110
          add accumNm, edx
          p110 :
        test eax, 32
          jz p120
          add accumNml, edx
          p120 :
        mov esi, incl
          mov ecx, stopx
          mov edi, mapp
          add ebx, esi
          mov edx, mapn
          cmp ebx, ecx
          jl xloop0
          end_yloop0 :
        mov esi, Height
          mov eax, prvf_pitch
          mov ebx, curf_pitch
          mov ecx, nxtf_pitch
          mov edi, map_pitch
          sub esi, 2
          add y, 2
          add mapp, edi
          add prvpf, eax
          add curpf, ebx
          add prvnf, eax
          add curf, ebx
          add nxtpf, ecx
          add prvppf, eax
          add curnf, ebx
          add nxtnf, ecx
          add mapn, edi
          add nxtppf, ecx
          cmp y, esi
          jl yloop0

          pop ebx // pf170421
      }
    }
    else
    {
      // TFM 1633
      __asm
      {
        push ebx // pf170421

        mov y, 2
        yloop1:
        mov ecx, y0a
          mov edx, y1a
          cmp ecx, edx
          je xloop_pre1
          mov eax, y
          cmp eax, ecx
          jl xloop_pre1
          cmp eax, edx
          jle end_yloop1
          xloop_pre1 :
        mov esi, incl
          mov ebx, startx
          mov edi, mapp
          mov edx, mapn
          mov ecx, stopx
          xloop1 :
        movzx eax, BYTE PTR[edi + ebx]
          shl eax, 3
          add al, BYTE PTR[edx + ebx]
          jnz b11
          add ebx, esi
          cmp ebx, ecx
          jl xloop1
          jmp end_yloop1
          b11 :
        mov edx, curf
          mov edi, curpf
          movzx ecx, BYTE PTR[edx + ebx]
          movzx esi, BYTE PTR[edi + ebx]
          shl ecx, 2
          mov edx, curnf
          add ecx, esi
          mov edi, prvpf
          movzx esi, BYTE PTR[edx + ebx]
          movzx edx, BYTE PTR[edi + ebx]
          add ecx, esi
          mov edi, prvnf
          movzx esi, BYTE PTR[edi + ebx]
          add edx, esi
          mov edi, edx
          add edx, edx
          sub edi, ecx
          add edx, edi
          jge b31
          neg edx
          b31 :
        cmp edx, 23
          jle p31
          test eax, 9
          jz p11
          add accumPc, edx
          p11 :
        cmp edx, 42
          jle p31
          test eax, 18
          jz p21
          add accumPm, edx
          p21 :
        test eax, 36
          jz p31
          add accumPml, edx
          p31 :
        mov edi, nxtpf
          mov esi, nxtnf
          movzx edx, BYTE PTR[edi + ebx]
          movzx edi, BYTE PTR[esi + ebx]
          add edx, edi
          mov esi, edx
          add edx, edx
          sub esi, ecx
          add edx, esi
          jge b21
          neg edx
          b21 :
        cmp edx, 23
          jle p61
          test eax, 9
          jz p41
          add accumNc, edx
          p41 :
        cmp edx, 42
          jle p61
          test eax, 18
          jz p51
          add accumNm, edx
          p51 :
        test eax, 36
          jz p61
          add accumNml, edx
          p61 :
        test eax, 7
          jz p121
          mov ecx, prvnf
          mov edi, prvpf
          movzx edx, BYTE PTR[ecx + ebx]
          movzx esi, BYTE PTR[edi + ebx]
          shl edx, 2
          mov ecx, prvnnf
          add edx, esi
          mov edi, curf
          movzx esi, BYTE PTR[ecx + ebx]
          movzx ecx, BYTE PTR[edi + ebx]
          add edx, esi
          mov edi, curnf
          movzx esi, BYTE PTR[edi + ebx]
          add ecx, esi
          mov edi, ecx
          add ecx, ecx
          add ecx, edi
          sub edx, ecx
          jge b41
          neg edx
          b41 :
        cmp edx, 23
          jle p91
          test eax, 1
          jz p71
          add accumPc, edx
          p71 :
        cmp edx, 42
          jle p91
          test eax, 2
          jz p81
          add accumPm, edx
          p81 :
        test eax, 4
          jz p91
          add accumPml, edx
          p91 :
        mov edi, nxtnf
          mov esi, nxtpf
          movzx edx, BYTE PTR[edi + ebx]
          movzx edi, BYTE PTR[esi + ebx]
          shl edx, 2
          mov esi, nxtnnf
          add edx, edi
          movzx edi, BYTE PTR[esi + ebx]
          add edx, edi
          sub edx, ecx
          jge b51
          neg edx
          b51 :
        cmp edx, 23
          jle p121
          test eax, 1
          jz p101
          add accumNc, edx
          p101 :
        cmp edx, 42
          jle p121
          test eax, 2
          jz p111
          add accumNm, edx
          p111 :
        test eax, 4
          jz p121
          add accumNml, edx
          p121 :
        mov esi, incl
          mov ecx, stopx
          mov edi, mapp
          add ebx, esi
          mov edx, mapn
          cmp ebx, ecx
          jl xloop1
          end_yloop1 :
        mov esi, Height
          mov eax, prvf_pitch
          mov ebx, curf_pitch
          mov ecx, nxtf_pitch
          mov edi, map_pitch
          sub esi, 2
          add y, 2
          add mapp, edi
          add prvpf, eax
          add curpf, ebx
          add prvnf, eax
          add curf, ebx
          add prvnnf, eax
          add nxtpf, ecx
          add curnf, ebx
          add nxtnf, ecx
          add mapn, edi
          add nxtnnf, ecx
          cmp y, esi
          jl yloop1

          pop ebx // pf170421

      }
    }
#endif
  }

  const unsigned int Const500 = 500 << (bits_per_pixel - 8);
  if (accumPm < Const500 && accumNm < Const500 && (accumPml >= Const500 || accumNml >= Const500) &&
    std::max(accumPml, accumNml) > 3 * std::min(accumPml, accumNml))
  {
    accumPm = accumPml;
    accumNm = accumNml;
  }

  // High bit depth: I chose to scale back to 8 bit range.
  // Or else we should treat them as int64 and act upon them outside
  const double factor = 1.0 / (1 << (bits_per_pixel - 8));

  norm1 = (int)((accumPc / 6.0 * factor) + 0.5);
  norm2 = (int)((accumNc / 6.0 * factor) + 0.5);
  mtn1 = (int)((accumPm / 6.0 * factor) + 0.5);
  mtn2 = (int)((accumNm / 6.0 * factor) + 0.5);
  // we are in the 8bit normalized region again, no change from here
  float c1 = float(std::max(norm1, norm2)) / float(std::max(std::min(norm1, norm2), 1));
  float c2 = float(std::max(mtn1, mtn2)) / float(std::max(std::min(mtn1, mtn2), 1));
  float mr = float(std::max(mtn1, mtn2)) / float(std::max(std::max(norm1, norm2), 1));
  if (((mtn1 >= 250 || mtn2 >= 250) && (mtn1 * 4 < mtn2 * 1 || mtn2 * 4 < mtn1 * 1)) ||
    ((mtn1 >= 375 || mtn2 >= 375) && (mtn1 * 3 < mtn2 * 1 || mtn2 * 3 < mtn1 * 1)) ||
    ((mtn1 >= 500 || mtn2 >= 500) && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1)) ||
    ((mtn1 >= 1000 || mtn2 >= 1000) && (mtn1 * 3 < mtn2 * 2 || mtn2 * 3 < mtn1 * 2)) ||
    ((mtn1 >= 2000 || mtn2 >= 2000) && (mtn1 * 5 < mtn2 * 4 || mtn2 * 5 < mtn1 * 4)) ||
    ((mtn1 >= 4000 || mtn2 >= 4000) && c2 > c1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  else if (mr > 0.005 && std::max(mtn1, mtn2) > 150 && (mtn1 * 2 < mtn2 * 1 || mtn2 * 2 < mtn1 * 1))
  {
    if (mtn1 > mtn2) ret = match2;
    else ret = match1;
  }
  else
  {
    if (norm1 > norm2) ret = match2;
    else ret = match1;
  }
//  if (debug)
//  {
//    sprintf(buf, "TFM:  frame %d  - comparing %c to %c  (SLOW 2)\n", n, MTC(match1), MTC(match2));
//    OutputDebugString(buf);
//    sprintf(buf, "TFM:  frame %d  - nmatches:  %d vs %d (%3.1f)  mmatches:  %d vs %d (%3.1f)\n", n,
//      norm1, norm2, c1, mtn1, mtn2, c2);
//    OutputDebugString(buf);
//  }
  return ret;
}

// Accumulates the sum of absolute differences between two pixel planes.
//
// srcp, nxtp           - first row of the two planes to compare
// height               - number of rows to visit
// width                - pixels per row; pixels are consumed in groups of four,
//                        so callers are expected to pass a multiple of 4
// src_pitch, nxt_pitch - row-to-row distance, measured in pixel_t elements
// diff                 - running total; the sum is ADDED to its current value
template<typename pixel_t>
static void checkSceneChangePlanar_1_c(const pixel_t* srcp, const pixel_t* nxtp,
  int height, int width, int src_pitch, int nxt_pitch, uint64_t& diff)
{
  const pixel_t* cur_row = srcp;
  const pixel_t* nxt_row = nxtp;
  for (int row = 0; row < height; ++row, cur_row += src_pitch, nxt_row += nxt_pitch)
  {
    // Per-row partial sum kept in 32 bits, folded into the 64 bit total below.
    uint32_t row_sad = 0;
    for (int base = 0; base < width; base += 4)
    {
      for (int k = 0; k < 4; ++k)
      {
        const int delta = int(cur_row[base + k]) - int(nxt_row[base + k]);
        row_sad += (delta < 0) ? -delta : delta;
      }
    }
    diff += row_sad;
  }
}

//void checkSceneChangeYUY2_1_c(const uint8_t* srcp, const uint8_t* nxtp,
//int height, int width, int src_pitch, int nxt_pitch, uint64_t& diff)
//{
//  for (int y = 0; y < height; ++y)
//  {
//    uint32_t rowdiff = 0;
//    for (int x = 0; x < width; x += 8)
//    {
//      rowdiff += abs(srcp[x + 0] - nxtp[x + 0]);
//      rowdiff += abs(srcp[x + 2] - nxtp[x + 2]);
//      rowdiff += abs(srcp[x + 4] - nxtp[x + 4]);
//      rowdiff += abs(srcp[x + 6] - nxtp[x + 6]);
//    }
//    diff += rowdiff;
//    srcp += src_pitch;
//    nxtp += nxt_pitch;
//  }
//}

// Accumulates two sums of absolute differences in a single pass:
// current-vs-previous into diffp and current-vs-next into diffn.
//
// prvp, srcp, nxtp - first row of the previous, current and next planes
// height           - number of rows to visit
// width            - pixels per row; pixels are consumed in groups of four,
//                    so callers are expected to pass a multiple of 4
// *_pitch          - row-to-row distance of each plane, in pixel_t elements
// diffp, diffn     - running totals; the sums are ADDED to their current values
template<typename pixel_t>
static void checkSceneChangePlanar_2_c(const pixel_t* prvp, const pixel_t* srcp,
  const pixel_t* nxtp, int height, int width, int prv_pitch, int src_pitch,
  int nxt_pitch, uint64_t& diffp, uint64_t& diffn)
{
  const pixel_t* prv_row = prvp;
  const pixel_t* cur_row = srcp;
  const pixel_t* nxt_row = nxtp;
  for (int row = 0; row < height; ++row)
  {
    // Per-row partial sums kept in 32 bits, folded into the 64 bit totals below.
    uint32_t sad_prev = 0;
    uint32_t sad_next = 0;
    for (int base = 0; base < width; base += 4)
    {
      for (int k = 0; k < 4; ++k)
      {
        const int cur = cur_row[base + k];
        const int to_prev = cur - int(prv_row[base + k]);
        const int to_next = cur - int(nxt_row[base + k]);
        sad_prev += (to_prev < 0) ? -to_prev : to_prev;
        sad_next += (to_next < 0) ? -to_next : to_next;
      }
    }
    diffp += sad_prev;
    diffn += sad_next;
    prv_row += prv_pitch;
    cur_row += src_pitch;
    nxt_row += nxt_pitch;
  }
}

//static void checkSceneChangeYUY2_2_c(const uint8_t* prvp, const uint8_t* srcp,
//  const uint8_t* nxtp, int height, int width, int prv_pitch, int src_pitch,
//  int nxt_pitch, uint64_t& diffp, uint64_t& diffn)
//{
//  for (int y = 0; y < height; ++y)
//  {
//    uint32_t rowdiffp = 0;
//    uint32_t rowdiffn = 0;
//    for (int x = 0; x < width; x += 8)
//    {
//      rowdiffp += abs(srcp[x + 0] - prvp[x + 0]);
//      rowdiffp += abs(srcp[x + 2] - prvp[x + 2]);
//      rowdiffp += abs(srcp[x + 4] - prvp[x + 4]);
//      rowdiffp += abs(srcp[x + 6] - prvp[x + 6]);
//      rowdiffn += abs(srcp[x + 0] - nxtp[x + 0]);
//      rowdiffn += abs(srcp[x + 2] - nxtp[x + 2]);
//      rowdiffn += abs(srcp[x + 4] - nxtp[x + 4]);
//      rowdiffn += abs(srcp[x + 6] - nxtp[x + 6]);
//    }
//    diffp += rowdiffp;
//    diffn += rowdiffn;
//    prvp += prv_pitch;
//    srcp += src_pitch;
//    nxtp += nxt_pitch;
//  }
//}

// Scene change test entry point: selects the pixel type from the clip's bit
// depth (uint8_t for 8 bit, uint16_t for everything else) and forwards to
// checkSceneChange_core. Returns whatever the core routine returns.
bool TFM::checkSceneChange(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int n)
{
  const int bitDepth = vi->format->bitsPerSample;
  if (bitDepth != 8)
    return checkSceneChange_core<uint16_t>(prv, src, nxt, n, bitDepth);
  return checkSceneChange_core<uint8_t>(prv, src, nxt, n, bitDepth);
}

// Decides whether frame n sits at a scene change by comparing one field of the
// luma plane (plane 0) of src against the same field of prv and nxt.
//
// prv, src, nxt  - previous, current and next frames
// n              - number of the current frame, used as key for the sclast cache
// bits_per_pixel - bit depth of the clip; sums are scaled back to 8 bit range
//
// Returns true when either the previous or the next difference exceeds
// diffmaxsc. The result and the "next" difference are stored in sclast so that
// a repeated call for n, or the following call for n + 1, can reuse them.
template<typename pixel_t>
bool TFM::checkSceneChange_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
  int n, int bits_per_pixel)
{
  // sclast.frame is set to n + 1 once frame n was evaluated: reuse that verdict.
  if (sclast.frame == n + 1) return sclast.sc;
  uint64_t diffp = 0;
  uint64_t diffn = 0;
  const uint8_t *prvp = vsapi->getReadPtr(prv, 0);
  const uint8_t *srcp = vsapi->getReadPtr(src, 0);
  const uint8_t *nxtp = vsapi->getReadPtr(nxt, 0);
  // only one field (every 2nd line) is examined
  const int height = vsapi->getFrameHeight(src, 0) >> 1;
  // Width in pixels, rounded down to mod16 (safe for SSE2).
  // This must be derived from the frame WIDTH and must use the same mod16
  // rounding as the computation of "diffmaxsc".
  const int width = (vsapi->getFrameWidth(src, 0) >> 4) << 4;

  // every 2nd line: doubled stride, taken from each frame individually
  const int prv_pitch = vsapi->getStride(prv, 0) << 1;
  const int src_pitch = vsapi->getStride(src, 0) << 1;
  const int nxt_pitch = vsapi->getStride(nxt, 0) << 1;
  // start at line (1 - field)
  prvp += (1 - field)*(prv_pitch >> 1);
  srcp += (1 - field)*(src_pitch >> 1);
  nxtp += (1 - field)*(nxt_pitch >> 1);

  // the SSE2 routines are only used for 8 bit pixels
  const bool use_sse2 = cpuFlags.sse2;

  if (sclast.frame == n)
  {
    // The previous call handled frame n - 1; its "next" difference is this
    // frame's "previous" difference. It was stored in 8 bit scale, so scale it
    // up again to match diffn until both are scaled back below.
    diffp = ((uint64_t)sclast.diff) << (bits_per_pixel - 8);
    if (sizeof(pixel_t) == 1 && use_sse2)
      checkSceneChangePlanar_1_SSE2(srcp, nxtp, height, width, src_pitch, nxt_pitch, diffn);
    else
      checkSceneChangePlanar_1_c<pixel_t>(
        reinterpret_cast<const pixel_t*>(srcp),
        reinterpret_cast<const pixel_t*>(nxtp),
        height, width,
        src_pitch / sizeof(pixel_t),
        nxt_pitch / sizeof(pixel_t),
        diffn);
  }
  else
  {
    // nothing cached: compute both differences
    if (sizeof(pixel_t) == 1 && use_sse2)
      checkSceneChangePlanar_2_SSE2(prvp, srcp, nxtp, height, width, prv_pitch, src_pitch, nxt_pitch, diffp, diffn);
    else
      checkSceneChangePlanar_2_c<pixel_t>(
        reinterpret_cast<const pixel_t*>(prvp),
        reinterpret_cast<const pixel_t*>(srcp),
        reinterpret_cast<const pixel_t*>(nxtp),
        height, width,
        prv_pitch / sizeof(pixel_t),
        src_pitch / sizeof(pixel_t),
        nxt_pitch / sizeof(pixel_t),
        diffp, diffn);
  }

  // scale back to 8 bit world
  diffn >>= (bits_per_pixel - 8);
  diffp >>= (bits_per_pixel - 8);

//  if (debug)
//  {
//    sprintf(buf, "TFM:  frame %d  - diffp = %u   diffn = %u  diffmaxsc = %u  %c\n", n, (unsigned int)diffp, (unsigned int)diffn, (unsigned int)diffmaxsc,
//      (diffp > diffmaxsc || diffn > diffmaxsc) ? 'T' : 'F');
//    OutputDebugString(buf);
//  }
  // remember the outcome for the next call
  sclast.frame = n + 1;
  sclast.diff = (unsigned long)diffn;
  sclast.sc = (diffp > diffmaxsc || diffn > diffmaxsc);
  return sclast.sc;
}

// Assembles the frame for the requested match into dst, plane by plane.
//
// dst           - destination frame (written)
// prv, src, nxt - previous, current and next source frames
// match         - which combination of fields to weave:
//                   0: lines of parity (1 - field) from src, parity field from prv
//                   1: plain copy of src
//                   2: lines of parity (1 - field) from src, parity field from nxt
//                   3: lines of parity field from src, parity (1 - field) from prv
//                   4: lines of parity field from src, parity (1 - field) from nxt
//                 any other value copies nothing
// cfrm          - match currently held by dst; nothing is done when it already
//                 equals match, and it is set to match on return
void TFM::createWeaveFrame(VSFrameRef *dst, const VSFrameRef *prv, const VSFrameRef *src,
  const VSFrameRef *nxt, int match, int &cfrm) const
{
  if (cfrm == match)
    return;

  const int planeCount = vi->format->numPlanes;
  const int bytesPerSample = vi->format->bytesPerSample;
  const int keptParity = 1 - field;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    // Copies every second line, starting at line "parity", from "from" to dst.
    const auto copyField = [&](const VSFrameRef *from, int parity)
    {
      vs_bitblt(vsapi->getWritePtr(dst, plane) + parity*vsapi->getStride(dst, plane),
        vsapi->getStride(dst, plane) << 1,
        vsapi->getReadPtr(from, plane) + parity*vsapi->getStride(from, plane),
        vsapi->getStride(from, plane) << 1,
        vsapi->getFrameWidth(from, plane) * bytesPerSample,
        vsapi->getFrameHeight(from, plane) >> 1);
    };

    switch (match)
    {
    case 0:
      copyField(src, keptParity);
      copyField(prv, field);
      break;
    case 1:
      // whole frame, no weaving
      vs_bitblt(vsapi->getWritePtr(dst, plane), vsapi->getStride(dst, plane),
        vsapi->getReadPtr(src, plane), vsapi->getStride(src, plane),
        vsapi->getFrameWidth(src, plane) * bytesPerSample,
        vsapi->getFrameHeight(src, plane));
      break;
    case 2:
      copyField(src, keptParity);
      copyField(nxt, field);
      break;
    case 3:
      copyField(src, field);
      copyField(prv, keptParity);
      break;
    case 4:
      copyField(src, field);
      copyField(nxt, keptParity);
      break;
    default:
//    throw TIVTCError("TFM:  an unknown error occurred (no such match!)");
      break;
    }
  }
  cfrm = match;
}

// Attaches the matching results to dst as frame properties.
//
// dst     - frame whose property map is written
// match   - stored under PROP_TFMMATCH
// combed  - stored under PROP_Combed as 1 when greater than 1, otherwise 0
// d2vfilm - stored under PROP_TFMD2VFilm
// mics    - five values stored as one array under PROP_TFMMics
// The members field and PP are stored under PROP_TFMField and PROP_TFMPP.
void TFM::putFrameProperties(VSFrameRef *dst, int match, int combed, bool d2vfilm, const int mics[5]) const
{
    VSMap *frameProps = vsapi->getFramePropsRW(dst);
    const bool isCombed = combed > 1;

    vsapi->propSetInt(frameProps, PROP_TFMMATCH, match, paReplace);
    vsapi->propSetInt(frameProps, PROP_Combed, isCombed, paReplace);
    vsapi->propSetInt(frameProps, PROP_TFMD2VFilm, d2vfilm, paReplace);
    vsapi->propSetInt(frameProps, PROP_TFMField, field, paReplace);

    // The first element replaces any existing array, the rest are appended.
    vsapi->propSetInt(frameProps, PROP_TFMMics, mics[0], paReplace);
    for (int idx = 1; idx < 5; ++idx)
        vsapi->propSetInt(frameProps, PROP_TFMMics, mics[idx], paAppend);

    vsapi->propSetInt(frameProps, PROP_TFMPP, PP, paReplace);
}

//template<typename pixel_t>
//void TFM::putHint_core(VSFrameRef *dst, int match, int combed, bool d2vfilm)
//{
//  pixel_t *p = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, 0));
//  pixel_t *srcp = p;
//  unsigned int i, hint = 0;
//  unsigned int hint2 = 0, magic_number = 0;

//  if (match == 0) hint |= ISP; /// match
//  else if (match == 1 && combed < 2) hint |= ISC;
//  else if (match == 2) hint |= ISN;
//  else if (match == 3) hint |= ISB;
//  else if (match == 4) hint |= ISU;
//  else if (match == 1 && combed > 1 && field == 0) hint |= ISDB; /// field
//  else if (match == 1 && combed > 1 && field == 1) hint |= ISDT;
//  if (field == 1) hint |= TOP_FIELD; /// field
//  if (combed > 1) hint |= COMBED; /// combed > 1
//  if (d2vfilm) hint |= D2VFILM; /// d2vfilm

//  for (i = 0; i < 32; ++i)
//  {
//    magic_number |= ((*srcp++ & 1) << i);
//  }
//  if (magic_number == MAGIC_NUMBER_2)
//  {
//    for (i = 0; i < 32; ++i)
//    {
//      hint2 |= ((*srcp++ & 1) << i);
//    }
//    hint2 <<= 8;
//    hint2 &= 0xFF00;
//    hint |= hint2 | 0x80;
//  }
//  for (i = 0; i < 32; ++i)
//  {
//    *p &= ~1;
//    *p++ |= ((MAGIC_NUMBER & (1 << i)) >> i);
//  }
//  for (i = 0; i < 32; ++i)
//  {
//    *p &= ~1;
//    *p++ |= ((hint & (1 << i)) >> i);
//  }
//}


// check in TDeint, plus don't call with aligned width!
//
// Thin member wrapper around do_buildABSDiffMask2<pixel_t>: forwards the two
// source planes (prvp, nxtp), the destination plane (dstp) and their pitches,
// together with this filter's cpuFlags and the bit depth.
// Note the argument order: this function takes Height before Width, while the
// forwarded call passes Width first and Height second.
template<typename pixel_t>
void TFM::buildDiffMapPlane2(const uint8_t *prvp, const uint8_t *nxtp,
  uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int bits_per_pixel) const
{
  do_buildABSDiffMask2<pixel_t>(prvp, nxtp, dstp, prv_pitch, nxt_pitch, dst_pitch, Width, Height, &cpuFlags, bits_per_pixel);
}

// instantiate
// (explicit instantiations for 8 bit and 16 bit pixel types)
template void TFM::buildDiffMapPlane2<uint8_t>(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int bits_per_pixel) const;
template void TFM::buildDiffMapPlane2<uint16_t>(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int bits_per_pixel) const;

// Thin member wrapper around do_buildABSDiffMask<pixel_t>: forwards the two
// source planes (prvp, nxtp) and their pitches, and uses the member buffer
// tbuffer as the destination with pitch tpitch. This filter's cpuFlags are
// passed along as well.
template<typename pixel_t>
void TFM::buildABSDiffMask(const uint8_t *prvp, const uint8_t *nxtp,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height) const
{
  do_buildABSDiffMask<pixel_t>(prvp, nxtp, tbuffer.get(), prv_pitch, nxt_pitch, tpitch, width, height, &cpuFlags);
}

// instantiate
// (explicit instantiations for 8 bit and 16 bit pixel types)
template void TFM::buildABSDiffMask<uint8_t>(const uint8_t* prvp, const uint8_t* nxtp,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height) const;
template void TFM::buildABSDiffMask<uint16_t>(const uint8_t* prvp, const uint8_t* nxtp,
  int prv_pitch, int nxt_pitch, int tpitch, int width, int height) const;


//AVSValue __cdecl Create_TFM(AVSValue args, void* user_data, IScriptEnvironment* env)
//{
//  bool chroma = args[16].IsBool() ? args[16].AsBool() : false;
//  VideoInfo vi = args[0].AsClip()->GetVideoInfo();
//  if (vi.IsY()) chroma = false;

//  AVSValue v = new TFM(args[0].AsClip(), args[1].AsInt(-1), args[2].AsInt(-1), args[3].AsInt(1),
//    args[4].AsInt(6), args[5].AsString(""), args[6].AsString(""), args[7].AsString(""), args[8].AsString(""),
//    args[9].AsBool(false), args[10].AsBool(false), args[11].AsInt(1), args[12].AsBool(true),
//    args[13].AsInt(15), args[14].AsInt(9), args[15].AsInt(80), chroma, args[17].AsInt(16),
//    args[18].AsInt(16), args[19].AsInt(0), args[20].AsInt(0), args[23].AsString(""), args[24].AsInt(0),
//    args[25].AsInt(4), args[26].AsFloat(12.0), args[27].AsInt(0), args[28].AsInt(1), args[29].AsString(""),
//    args[30].AsBool(true), args[31].AsInt(0), args[32].AsBool(false), args[33].AsBool(true),
//    args[34].AsBool(true), args[35].AsInt(4), env);
//  if (!args[4].IsInt() || args[4].AsInt() >= 2)
//  {
//    if (!args[4].IsInt() || args[4].AsInt() > 4)
//    {
//      try { v = env->Invoke("InternalCache", v).AsClip(); }
//      catch (IScriptEnvironment::NotFound) {}
//    }
//    v = new TFMPP(v.AsClip(), args[4].AsInt(6), args[21].AsInt(5), args[5].AsString(""),
//      args[10].AsBool(false), (args[22].IsClip() ? args[22].AsClip() : nullptr),
//      args[30].AsBool(true), args[35].AsInt(4), env);
//  }
//  return v;
//}

TFM::TFM(VSNodeRef *_child, int _order, int _field, int _mode, int _PP, const char* _ovr,
  const char* _input, const char* _output, const char * _outputC, bool _debug, bool _display,
  int _slow, bool _mChroma, int _cNum, int _cthresh, int _MI, bool _chroma, int _blockx,
  int _blocky, int _y0, int _y1, const char* _d2v, int _ovrDefault, int _flags, double _scthresh,
  int _micout, int _micmatching, const char* _trimIn, bool _usehints, int _metric, bool _batch,
  bool _ubsco, bool _mmsco, int _opt, const VSAPI *_vsapi, VSCore *core)
    : vsapi(_vsapi), child(_child),
  order(_order), field(_field), mode(_mode), PP(_PP), ovr(_ovr), input(_input), output(_output),
  outputC(_outputC), debug(_debug), display(_display), slow(_slow), mChroma(_mChroma), cNum(_cNum),
  cthresh(_cthresh), MI(_MI), chroma(_chroma), blockx(_blockx), blocky(_blocky), y0(_y0),
  y1(_y1), d2v(_d2v), ovrDefault(_ovrDefault), flags(_flags), scthresh(_scthresh), micout(_micout),
  micmatching(_micmatching), trimIn(_trimIn), usehints(_usehints), metric(_metric),
  batch(_batch), ubsco(_ubsco), mmsco(_mmsco), opt(_opt), cArray(nullptr, nullptr), tbuffer(nullptr, nullptr),
  map(nullptr, nullptr), cmask(nullptr, nullptr)
{
    vi = vsapi->getVideoInfo(child);

  int z, w, q = 0, b, i, count, last, fieldt, firstLine, qt;
  int countOvrS, countOvrM;
  char linein[1024];
  char *linep, *linet;
  std::unique_ptr<FILE, decltype (&fclose)> f(nullptr, nullptr);


  cpuFlags = *getCPUFeatures();
  if (opt == 0) memset(&cpuFlags, 0, sizeof(cpuFlags));

  if (!vi->format || vi->width == 0 || vi->height == 0)
      throw TIVTCError("TFM: the input clip must have constant format and dimensions.");

  if (vi->format->colorFamily == cmGray)
      chroma = false;

  if (vi->format->bitsPerSample > 16)
    throw TIVTCError("TFM:  only 8-16 bit formats supported!");
  if (vi->format->sampleType != stInteger)
      throw TIVTCError("TFM: only integer formats supported!");
  if (vi->format->colorFamily != cmYUV)
    throw TIVTCError("TFM:  YUV data only!");
  if (vi->height & 1 || vi->width & 1)
    throw TIVTCError("TFM:  height and width must be divisible by 2!");
  if (vi->height < 6 || vi->width < 64)
    throw TIVTCError("TFM:  frame dimensions too small!");
  if (mode < 0 || mode > 7)
    throw TIVTCError("TFM:  mode must be set to 0, 1, 2, 3, 4, 5, 6, or 7!");
  if (field < -1 || field > 1)
    throw TIVTCError("TFM:  field must be set to -1, 0, or 1!");
  if (PP < 0 || PP > 7)
    throw TIVTCError("TFM:  PP must be at least 0 and less than 8!");
  if (order < -1 || order > 1)
    throw TIVTCError("TFM:  order must be set to -1, 0, or 1!");
  if (blockx != 4 && blockx != 8 && blockx != 16 && blockx != 32 && blockx != 64 &&
    blockx != 128 && blockx != 256 && blockx != 512 && blockx != 1024 && blockx != 2048)
    throw TIVTCError("TFM:  illegal blockx size!");
  if (blocky != 4 && blocky != 8 && blocky != 16 && blocky != 32 && blocky != 64 &&
    blocky != 128 && blocky != 256 && blocky != 512 && blocky != 1024 && blocky != 2048)
    throw TIVTCError("TFM:  illegal blocky size!");
  if (y0 != y1 && (y0 < 0 || y1 < 0 || y0 > y1 || y1 > vi->height || y0 > vi->height))
    throw TIVTCError("TFM:  bad y0 and y1 exclusion band values!");
  if (ovrDefault < 0 || ovrDefault > 2)
    throw TIVTCError("TFM:  ovrDefault must be set to 0, 1, or 2!");
  if (flags < 0 || flags > 5)
    throw TIVTCError("TFM:  flags must be set to 0, 1, 2, 3, 4, or 5!");
  if (slow < 0 || slow > 2)
    throw TIVTCError("TFM:  slow must be set to 0, 1, or 2!");
  if (micout < 0 || micout > 2)
    throw TIVTCError("TFM:  micout must be set to 0, 1, or 2!");
  if (micmatching < 0 || micmatching > 4)
    throw TIVTCError("TFM:  micmatching must be set to 0, 1, 2, 3, or 4!");
  if (opt < 0 || opt > 4)
    throw TIVTCError("TFM:  opt must be set to 0, 1, 2, 3, or 4!");
  if (metric != 0 && metric != 1)
    throw TIVTCError("TFM:  metric must be set to 0 or 1!");
  if (scthresh < 0.0 || scthresh > 100.0)
    throw TIVTCError("TFM:  scthresh must be between 0.0 and 100.0 (inclusive)!");

//  if (debug)
//  {
//    sprintf(buf, "TFM:  %s by tritical\n", VERSION);
//    OutputDebugString(buf);
//  }

//  child->SetCacheHints(CACHE_GENERIC, 3);  // fixed to diameter (07/30/2005)

  lastMatch.frame = lastMatch.field = lastMatch.combed = lastMatch.match = -20;
  nfrms = vi->numFrames - 1;
  mode_origSaved = mode;
  PP_origSaved = PP;
  MI_origSaved = MI;
  d2vpercent = -20.00f;
  vidCount = 0;

  xhalf = blockx >> 1;
  yhalf = blocky >> 1;
  
  xshift = blockx == 4 ? 2 : blockx == 8 ? 3 : blockx == 16 ? 4 : blockx == 32 ? 5 :
    blockx == 64 ? 6 : blockx == 128 ? 7 : blockx == 256 ? 8 : blockx == 512 ? 9 :
    blockx == 1024 ? 10 : 11;
  yshift = blocky == 4 ? 2 : blocky == 8 ? 3 : blocky == 16 ? 4 : blocky == 32 ? 5 :
    blocky == 64 ? 6 : blocky == 128 ? 7 : blocky == 256 ? 8 : blocky == 512 ? 9 :
    blocky == 1024 ? 10 : 11;

  
  // no high bit depth scaling here
  // Warning: this mod16 must match with the calculation in "checkSceneChange"
  diffmaxsc = int((double(((vi->width >> 4) << 4)*vi->height * (235-16))*scthresh*0.5) / 100.0);

  sclast.frame = -20;
  sclast.sc = true;

  if (mode == 1 || mode == 2 || mode == 3 || mode == 5 || mode == 6 || mode == 7 ||
    PP > 0 || micout > 0 || micmatching > 0)
  {
    cArray = decltype(cArray) (vs_aligned_malloc<int>((((vi->width + xhalf) >> xshift) + 1)*(((vi->height + yhalf) >> yshift) + 1) * 4 * sizeof(int), 16), &vs_aligned_free);
    if (!cArray) {
        throw TIVTCError("TFM:  malloc failure (cArray)!");
    }
    cmask = decltype(cmask) (vsapi->newVideoFrame(vi->format, vi->width, vi->height, nullptr, core), vsapi->freeFrame);
  }

  // prepare map format: always 8 bits
  const VSFormat *map_format = vsapi->registerFormat(vi->format->colorFamily, vi->format->sampleType, 8, vi->format->subSamplingW, vi->format->subSamplingH, core);
  map = decltype(map) (vsapi->newVideoFrame(map_format, vi->width, vi->height, nullptr, core), vsapi->freeFrame);

  if (d2v.size())
  {
    parseD2V();

    trimArray.resize(0);
  }
  order_origSaved = order;
  field_origSaved = fieldO = field;
  if (fieldO == -1)
  {
    if (order == -1) {
        char error[512] = "TFM: Couldn't fetch the first frame from the input clip to determine the clip's field order. Reason: ";
        size_t len = strlen(error);

        const VSFrameRef *first_frame = vsapi->getFrame(0, child, error + len, 512 - len);
        if (first_frame == nullptr) {
            throw TIVTCError(error);
        }
        const VSMap *props = vsapi->getFramePropsRO(first_frame);

        int err;
        int64_t field_based = vsapi->propGetInt(props, "_FieldBased", 0, &err);
        vsapi->freeFrame(first_frame);
        if (err) {
            throw TIVTCError("TFM: Couldn't find the '_FieldBased' frame property. The 'order' parameter must be used.");
        }

        /// Pretend it's top field first when it says progressive?
        fieldO = (field_based == TopFieldFirst || field_based == Progressive);

//        fieldO = child->GetParity(0) ? 1 : 0;
    }
    else fieldO = order;
  }
  tpitchy = tpitchuv = -20;
  
  const int ALIGN_BUF = 64;

  // Rounds up the number "n" to the next greater multiple of "align"
#define ALIGN_NUMBER(n, align) (((n) + (align)-1) & (~((align)-1)))

  {
    // tbuffer is 8 or 16 bits wide
    const int pixelsize = vi->format->bytesPerSample;
    tpitchy = ALIGN_NUMBER(vi->width * pixelsize, ALIGN_BUF);
    const int widthUV = vi->format->numPlanes > 1 ? vi->width >> vi->format->subSamplingW : 0;
    tpitchuv = ALIGN_NUMBER(widthUV * pixelsize, ALIGN_BUF);
  }
#undef ALIGN_NUMBER

  // 16 would be enough for SSE2, but maybe we'll do AVX2?
  tbuffer = decltype(tbuffer) (vs_aligned_malloc<uint8_t>((vi->height >> 1) * tpitchy, ALIGN_BUF), &vs_aligned_free);
  if (!tbuffer) throw TIVTCError("TFM:  malloc failure (tbuffer)!");
  mode7_field = field;
  if (input.size())
  {
    bool d2vmarked, micmarked;
    if ((f = decltype (f)(tivtc_fopen(input.c_str(), "r"), &fclose)) != nullptr)
    {
      ovrArray.resize(vi->numFrames, 255);

      if (d2vfilmarray.size() == 0)
      {
        d2vfilmarray.resize(vi->numFrames + 1, 0);
      }
      fieldt = fieldO;
      firstLine = 0;
//      if (debug)
//      {
//        sprintf(buf, "TFM:  successfully opened input file.  Field defaulting to - %s.\n",
//          fieldt == 0 ? "bottom" : "top");
//        OutputDebugString(buf);
//      }
      while (fgets(linein, 1024, f.get()) != nullptr)
      {
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
          continue;
        ++firstLine;
        linep = linein;
        while (*linep != 'f' && *linep != 'F' && *linep != 0 && *linep != ' ' && *linep != 'c') linep++;
        if (*linep == 'f' || *linep == 'F')
        {
          if (firstLine == 1)
          {
            bool changed = false;
            if (_strnicmp(linein, "field = top", 11) == 0) { fieldt = 1; changed = true; }
            else if (_strnicmp(linein, "field = bottom", 14) == 0) { fieldt = 0; changed = true; }
//            if (debug && changed)
//            {
//              sprintf(buf, "TFM:  detected field for input file - %s.\n",
//                fieldt == 0 ? "bottom" : "top");
//              OutputDebugString(buf);
//            }
          }
        }
        else if (*linep == 'c')
        {
          if (_strnicmp(linein, "crc32 = ", 8) == 0)
          {
            linet = linein;
            while (*linet != ' ') linet++;
            linet++;
            while (*linet != ' ') linet++;
            linet++;
            unsigned int m, tempCrc;
            sscanf(linet, "%x", &m);
            calcCRC(child, 15, tempCrc, vsapi);
            if (tempCrc != m && !batch)
            {
              throw TIVTCError("TFM:  crc32 in input file does not match that of the current clip!");
            }
          }
        }
        else if (*linep == ' ')
        {
          linet = linein;
          while (*linet != 0)
          {
            if (*linet != ' ' && *linet != 10) break;
            linet++;
          }
          if (*linet == 0) { --firstLine; continue; }
          sscanf(linein, "%d", &z);
          linep = linein;
          while (*linep != 'p' && *linep != 'c' && *linep != 'n' && *linep != 'u' &&
            *linep != 'b' && *linep != 'l' && *linep != 'h' && *linep != 0) linep++;
          if (*linep != 0)
          {
            if (z<0 || z>nfrms)
            {
              throw TIVTCError("TFM:  input file error (out of range or non-ascending frame #)!");
            }
            linep = linein;
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep != 0)
            {
              qt = -1;
              d2vmarked = micmarked = false;
              linep++;
              q = *linep;
              if (q == 112) q = 0;
              else if (q == 99) q = 1;
              else if (q == 110) q = 2;
              else if (q == 98) q = 3;
              else if (q == 117) q = 4;
              else if (q == 108) q = 5;
              else if (q == 104) q = 6;
              else
              {
                throw TIVTCError("TFM:  input file error (invalid match specifier)!");
              }
              linep++;
              linep++;
              if (*linep != 0)
              {
                qt = *linep;
                if (qt == 45) qt = 0;
                else if (qt == 43) qt = COMBED;
                else if (qt == '1') { d2vmarked = true; qt = -1; }
                else if (qt == '[') { micmarked = true; qt = -1; }
                else
                {
                  throw TIVTCError("TFM:  input file error (invalid specifier)!");
                }
              }
              if (fieldt != fieldO)
              {
                if (q == 0) q = 3;
                else if (q == 2) q = 4;
                else if (q == 3) q = 0;
                else if (q == 4) q = 2;
              }
              if (!d2vmarked && !micmarked && qt != -1)
              {
                linep++;
                linep++;
                if (*linep == '1') d2vmarked = true;
                else if (*linep == '[') micmarked = true;
              }
              if (d2vmarked)
              {
                d2vfilmarray[z] &= ~0x03;
                d2vfilmarray[z] |= fieldt == 1 ? 0x3 : 0x1;
                if (!micmarked)
                {
                  linep++;
                  linep++;
                  if (*linep == '[') micmarked = true;
                }
              }
              if (micmarked)
              {
                // add mic input handling in the future
              }
              ovrArray[z] |= 0x07;
              ovrArray[z] &= (q | 0xF8);
              if (qt != -1)
              {
                ovrArray[z] &= 0xDF;
                ovrArray[z] |= 0x10;
                ovrArray[z] &= (qt | 0xEF);
              }
            }
          }
        }
      }
    }
    else throw TIVTCError("TFM:  input file error (could not open file)!");
  }
  if (ovr.size())
  {
    if ((f = decltype (f)(tivtc_fopen(ovr.c_str(), "r"), &fclose)) != nullptr)
    {
      countOvrS = countOvrM = 0;
      while (fgets(linein, 1024, f.get()) != nullptr)
      {
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
          continue;
        linep = linein;
        while (*linep != 'c' && *linep != 'p' && *linep != 'n' && *linep != 'b' &&
          *linep != 'u' && *linep != 'l' && *linep != 'h' && *linep != '+' && *linep != '-' && *linep != 0) linep++;
        if (*linep == 0) ++countOvrS;
        else ++countOvrM;
      }
      if (ovrDefault != 0 && ovrArray.size())
      {
        if (ovrDefault == 1) q = 0;
        else if (ovrDefault == 2) q = COMBED;
        for (int h = 0; h < vi->numFrames; ++h)
        {
          ovrArray[h] &= 0xDF;
          ovrArray[h] |= 0x10;
          ovrArray[h] &= (q | 0xEF);
          if (q == 0 && ((ovrArray[h] & 7) == 6 ||
            (ovrArray[h] & 7) == 5))
          {
            ovrArray[h] |= 0x07;
            ovrArray[h] &= (1 | 0xF8);
          }
        }
      }
      if (countOvrS == 0 && countOvrM == 0) { goto emptyovr; }
      if (countOvrS > 0)
      {
        ++countOvrS;
        countOvrS *= 4;
        setArray.resize(countOvrS, 0xffffffff);
      }
      if (countOvrM > 0 && ovrArray.size() == 0)
      {
        ovrArray.resize(vi->numFrames, 255);
        if (ovrDefault != 0)
        {
          if (ovrDefault == 1) q = 0;
          else if (ovrDefault == 2) q = COMBED;
          for (int h = 0; h < vi->numFrames; ++h)
          {
            ovrArray[h] &= 0xDF;
            ovrArray[h] |= 0x10;
            ovrArray[h] &= (q | 0xEF);
          }
        }
      }
      last = -1;
      fieldt = fieldO;
      firstLine = 0;
      i = 0;
      if ((f = decltype (f)(tivtc_fopen(ovr.c_str(), "r"), &fclose)) != nullptr)
      {
//        if (debug)
//        {
//          sprintf(buf, "TFM:  successfully opened ovr file.  Field defaulting to - %s.\n",
//            fieldt == 0 ? "bottom" : "top");
//          OutputDebugString(buf);
//        }
        while (fgets(linein, 1024, f.get()) != nullptr)
        {
          if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
            continue;
          ++firstLine;
          linep = linein;
          while (*linep != 'f' && *linep != 'F' && *linep != 0 && *linep != ' ' && *linep != ',') linep++;
          if (*linep == 'f' || *linep == 'F')
          {
            if (firstLine == 1)
            {
              bool changed = false;
              if (_strnicmp(linein, "field = top", 11) == 0) { fieldt = 1; changed = true; }
              else if (_strnicmp(linein, "field = bottom", 14) == 0) { fieldt = 0; changed = true; }
//              if (debug && changed)
//              {
//                sprintf(buf, "TFM:  detected field for ovr file - %s.\n",
//                  fieldt == 0 ? "bottom" : "top");
//                OutputDebugString(buf);
//              }
            }
          }
          else if (*linep == ' ')
          {
            linet = linein;
            while (*linet != 0)
            {
              if (*linet != ' ' && *linet != 10) break;
              linet++;
            }
            if (*linet == 0) { --firstLine; continue; }
            linep++;
            if (*linep == 'p' || *linep == 'c' || *linep == 'n' || *linep == 'b' || *linep == 'u' || *linep == 'l' || *linep == 'h')
            {
              sscanf(linein, "%d", &z);
              if (z<0 || z>nfrms || z <= last)
              {
                throw TIVTCError("TFM:  ovr file error (out of range or non-ascending frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                q = *linep;
                if (q == 112) q = 0;
                else if (q == 99) q = 1;
                else if (q == 110) q = 2;
                else if (q == 98) q = 3;
                else if (q == 117) q = 4;
                else if (q == 108) q = 5;
                else if (q == 104) q = 6;
                else
                {
                  throw TIVTCError("TFM:  ovr file error (invalid match specifier)!");
                }
                if (fieldt != fieldO)
                {
                  if (q == 0) q = 3;
                  else if (q == 2) q = 4;
                  else if (q == 3) q = 0;
                  else if (q == 4) q = 2;
                }
                ovrArray[z] |= 0x07;
                ovrArray[z] &= (q | 0xF8);
                last = z;
              }
            }
            else if (*linep == '-' || *linep == '+')
            {
              sscanf(linein, "%d", &z);
              if (z<0 || z>nfrms)
              {
                throw TIVTCError("TFM:  ovr file error (out of range or non-ascending frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                q = *linep;
                if (q == 45) q = 0;
                else if (q == 43) q = COMBED;
                else
                {
                  throw TIVTCError("TFM:  ovr file error (invalid symbol)!");
                }
                ovrArray[z] &= 0xDF;
                ovrArray[z] |= 0x10;
                ovrArray[z] &= (q | 0xEF);
                if (q == 0 && ((ovrArray[z] & 7) == 6 ||
                  (ovrArray[z] & 7) == 5))
                {
                  ovrArray[z] |= 0x07;
                  ovrArray[z] &= (1 | 0xF8);
                }
              }
            }
            else
            {
              sscanf(linein, "%d", &z);
              if (z<0 || z>nfrms)
              {
                throw TIVTCError("TFM:  ovr input error (out of range frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*linep == 'f' || *linep == 'm' || *linep == 'o' || *linep == 'P' || *linep == 'i')
                {
                  q = *linep;
                  linep++;
                  linep++;
                  if (*linep == 0) continue;
                  sscanf(linep, "%d", &b);
                  if (q == 102 && b != 0 && b != 1 && b != -1)
                  {
                    throw TIVTCError("TFM:  ovr input error (bad field value)!");
                  }
                  else if (q == 111 && b != 0 && b != 1 && b != -1)
                  {
                    throw TIVTCError("TFM:  ovr input error (bad order value)!");
                  }
                  else if (q == 109 && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFM:  ovr input error (bad mode value)!");
                  }
                  else if (q == 80 && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFM:  ovr input error (bad PP value)!");
                  }
                  setArray[i] = q; ++i;
                  setArray[i] = z; ++i;
                  setArray[i] = z; ++i;
                  setArray[i] = b; ++i;
                }
              }
            }
          }
          else if (*linep == ',')
          {
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++;
            if (*linep == 'p' || *linep == 'c' || *linep == 'n' || *linep == 'u' || *linep == 'b' || *linep == 'l' || *linep == 'h')
            {
              sscanf(linein, "%d,%d", &z, &w);
              if (w == 0) w = nfrms;
              if (z<0 || z>nfrms || w<0 || w>nfrms || w < z || z <= last)
              {
                throw TIVTCError("TFM:  input file error (out of range or non-ascending frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*(linep + 1) == 'p' || *(linep + 1) == 'c' || *(linep + 1) == 'n' || *(linep + 1) == 'b' || *(linep + 1) == 'u' || *(linep + 1) == 'l' || *(linep + 1) == 'h')
                {
                  count = 0;
                  while ((*linep == 'p' || *linep == 'c' || *linep == 'n' || *linep == 'b' || *linep == 'u' || *linep == 'l' || *linep == 'h') && (z + count <= w))
                  {
                    q = *linep;
                    if (q == 112) q = 0;
                    else if (q == 99) q = 1;
                    else if (q == 110) q = 2;
                    else if (q == 98) q = 3;
                    else if (q == 117) q = 4;
                    else if (q == 108) q = 5;
                    else if (q == 104) q = 6;
                    else
                    {
                      throw TIVTCError("TFM:  input file error (invalid match specifier)!");
                    }
                    if (fieldt != fieldO)
                    {
                      if (q == 0) q = 3;
                      else if (q == 2) q = 4;
                      else if (q == 3) q = 0;
                      else if (q == 4) q = 2;
                    }
                    ovrArray[z + count] |= 0x07;
                    ovrArray[z + count] &= (q | 0xF8);
                    ++count;
                    linep++;
                  }
                  while (z + count <= w)
                  {
                    ovrArray[z + count] |= 0x07;
                    ovrArray[z + count] &= (ovrArray[z] | 0xF8);
                    ++z;
                  }
                  last = w;
                }
                else
                {
                  q = *linep;
                  if (q == 112) q = 0;
                  else if (q == 99) q = 1;
                  else if (q == 110) q = 2;
                  else if (q == 98) q = 3;
                  else if (q == 117) q = 4;
                  else if (q == 108) q = 5;
                  else if (q == 104) q = 6;
                  else
                  {
                    throw TIVTCError("TFM:  input file error (invalid match specifier)!");
                  }
                  if (fieldt != fieldO)
                  {
                    if (q == 0) q = 3;
                    else if (q == 2) q = 4;
                    else if (q == 3) q = 0;
                    else if (q == 4) q = 2;
                  }
                  while (z <= w)
                  {
                    ovrArray[z] |= 0x07;
                    ovrArray[z] &= (q | 0xF8);
                    ++z;
                  }
                  last = w;
                }
              }
            }
            else if (*linep == '-' || *linep == '+')
            {
              sscanf(linein, "%d,%d", &z, &w);
              if (w == 0) w = nfrms;
              if (z<0 || z>nfrms || w<0 || w>nfrms || w < z)
              {
                throw TIVTCError("TFM:  input file error (out of range or non-ascending frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*(linep + 1) == '-' || *(linep + 1) == '+')
                {
                  count = 0;
                  while ((*linep == '-' || *linep == '+') && (z + count <= w))
                  {
                    q = *linep;
                    if (q == 45) q = 0;
                    else if (q == 43) q = COMBED;
                    else
                    {
                      throw TIVTCError("TFM:  input file error (invalid symbol)!");
                    }
                    ovrArray[z + count] &= 0xDF;
                    ovrArray[z + count] |= 0x10;
                    ovrArray[z + count] &= (q | 0xEF);
                    if (q == 0 && ((ovrArray[z + count] & 7) == 6 ||
                      (ovrArray[z + count] & 7) == 5))
                    {
                      ovrArray[z + count] |= 0x07;
                      ovrArray[z + count] &= (1 | 0xF8);
                    }
                    ++count;
                    linep++;
                  }
                  while (z + count <= w)
                  {
                    ovrArray[z + count] &= 0xDF;
                    ovrArray[z + count] |= 0x10;
                    ovrArray[z + count] &= (ovrArray[z] | 0xEF);
                    if ((ovrArray[z] & 0x10) == 0 && ((ovrArray[z + count] & 7) == 6 ||
                      (ovrArray[z + count] & 7) == 5))
                    {
                      ovrArray[z + count] |= 0x07;
                      ovrArray[z + count] &= (1 | 0xF8);
                    }
                    ++z;
                  }
                }
                else
                {
                  q = *linep;
                  if (q == 45) q = 0;
                  else if (q == 43) q = COMBED;
                  else
                  {
                    throw TIVTCError("TFM:  input file error (invalid symbol)!");
                  }
                  while (z <= w)
                  {
                    ovrArray[z] &= 0xDF;
                    ovrArray[z] |= 0x10;
                    ovrArray[z] &= (q | 0xEF);
                    if (q == 0 && ((ovrArray[z] & 7) == 6 ||
                      (ovrArray[z] & 7) == 5))
                    {
                      ovrArray[z] |= 0x07;
                      ovrArray[z] &= (1 | 0xF8);
                    }
                    ++z;
                  }
                }
              }
            }
            else
            {
              sscanf(linein, "%d,%d", &z, &w);
              if (w == 0) w = nfrms;
              if (z<0 || z>nfrms || w<0 || w>nfrms || w < z)
              {
                throw TIVTCError("TFM: ovr input error (invalid frame range)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*linep == 'f' || *linep == 'm' || *linep == 'o' || *linep == 'P' || *linep == 'i')
                {
                  q = *linep;
                  linep++;
                  linep++;
                  if (*linep == 0) continue;
                  sscanf(linep, "%d", &b);
                  if (q == 102 && b != 0 && b != 1 && b != -1)
                  {
                    throw TIVTCError("TFM:  ovr input error (bad field value)!");
                  }
                  else if (q == 111 && b != 0 && b != 1 && b != -1)
                  {
                    throw TIVTCError("TFM:  ovr input error (bad order value)!");
                  }
                  else if (q == 109 && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFM:  ovr input error (bad mode value)!");
                  }
                  else if (q == 80 && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFM:  ovr input error (bad PP value)!");
                  }
                  setArray[i] = q; ++i;
                  setArray[i] = z; ++i;
                  setArray[i] = w; ++i;
                  setArray[i] = b; ++i;
                }
              }
            }
          }
        }
      }
      else {
          throw TIVTCError("TFM:  ovr file error (could not open file)!");
      }
    }
    else {
        throw TIVTCError("TFM:  ovr input error (could not open ovr file)!");
    }
  }
emptyovr:
  if (output.size())
  {
    if ((f = decltype (f)(tivtc_fopen(output.c_str(), "w"), &fclose)) != nullptr)
    {
      _fullpath(outputFull, output.c_str(), MAX_PATH);
      calcCRC(child, 15, outputCrc, vsapi);
      outArray.resize(vi->numFrames, 0);
      moutArray.resize(vi->numFrames, -1);
      if (micout > 0)
      {
        int sn = micout == 1 ? 3 : 5;
        moutArrayE.resize(vi->numFrames * sn, -20);
      }
    }
    else {
        throw TIVTCError("TFM:  output file error (cannot create file)!");
    }
  }
  if (outputC.size())
  {
    if ((f = decltype (f)(tivtc_fopen(outputC.c_str(), "w"), &fclose)) != nullptr)
    {
      _fullpath(outputCFull, outputC.c_str(), MAX_PATH);
      if (outArray.size() == 0)
      {
        outArray.resize(vi->numFrames, 0);
      }
    }
    else {
        throw TIVTCError("TFM:  outputC file error (cannot create file)!");
    }
  }
  /// attach the value of PP to the first frame? TDecimate uses this to do something in the constructor while processing the tfmIn file.
  ///
//  AVSValue tfmPassValue(PP);
//  const char *varname = "TFMPPValue";
//  env->SetVar(varname, tfmPassValue);
}

// Destructor: writes the results collected in outArray / moutArray / moutArrayE to the
// files requested through `output` and `outputC`, then releases the input node.
//
// Nothing is written when outArray is empty. A file that cannot be reopened for writing
// is skipped silently. `child` is freed in every case.
//
// Every line is emitted piecewise with fprintf/fputs/fputc instead of being assembled in a
// fixed 40-byte stack buffer: a line such as
//   "123456 c + 1 [100] (100 100 100 100 100)\n"
// is longer than 40 bytes and used to overflow that buffer. The bytes written to the file
// are unchanged.
TFM::~TFM()
{
  if (outArray.size())
  {
    if (output.size())
    {
      FILE *f = tivtc_fopen(outputFull, "w");
      if (f != nullptr)
      {
        // Number of moutArrayE entries stored per frame.
        const int sn = micout == 1 ? 3 : 5;
        if (moutArrayE.size())
        {
          // -20 is the fill value moutArrayE was sized with; report such entries as -1.
          for (int i = 0; i < sn * vi->numFrames; ++i)
          {
            if (moutArrayE[i] == -20) moutArrayE[i] = -1;
          }
        }
        fprintf(f, "#TFM %s by tritical\n", VERSION);
        fprintf(f, "field = %s\n", fieldO == 1 ? "top" : "bottom");
        fprintf(f, "crc32 = %x\n", outputCrc);
        for (int h = 0; h <= nfrms; ++h)
        {
          // Only frames flagged with FILE_ENTRY get a line.
          if (!(outArray[h] & FILE_ENTRY)) continue;

          // The low three bits hold the match code; MTC turns it into its letter.
          const int match = (outArray[h] & 0x07);
          fprintf(f, "%d %c", h, MTC(match));

          // Bit 0x20 gates the +/- marker, bit 0x10 selects '+' over '-'.
          if (outArray[h] & 0x20)
          {
            if (outArray[h] & 0x10) fputs(" +", f);
            else fputs(" -", f);
          }
          if (outArray[h] & FILE_D2V) fputs(" 1", f);
          if (moutArray.size() && moutArray[h] != -1)
          {
            fprintf(f, " [%d]", moutArray[h]);
          }
          if (moutArrayE.size())
          {
            const int th = h*sn;
            if (sn == 3) fprintf(f, " (%d %d %d)", moutArrayE[th + 0],
              moutArrayE[th + 1], moutArrayE[th + 2]);
            else fprintf(f, " (%d %d %d %d %d)", moutArrayE[th + 0],
              moutArrayE[th + 1], moutArrayE[th + 2], moutArrayE[th + 3],
              moutArrayE[th + 4]);
          }
          fputc('\n', f);
        }
        generateOvrHelpOutput(f);
        fclose(f);
      }
    }
    if (outputC.size())
    {
      FILE *f = tivtc_fopen(outputCFull, "w");
      if (f != nullptr)
      {
        // Length of the current run of consecutive frames whose match code is 1, 5 or 6.
        int count = 0;
        fprintf(f, "#TFM %s by tritical\n", VERSION);
        for (int h = 0; h <= nfrms; ++h)
        {
          // Frames without FILE_ENTRY are treated as match code 0.
          const int match = (outArray[h] & FILE_ENTRY) ? (outArray[h] & 0x07) : 0;
          if (match == 1 || match == 5 || match == 6) ++count;
          else
          {
            // A run just ended at h - 1; report it only if it is longer than cNum.
            if (count > cNum) fprintf(f, "%d,%d\n", h - count, h - 1);
            count = 0;
          }
        }
        // A run that reaches the last frame is closed here.
        if (count > cNum) fprintf(f, "%d,%d\n", nfrms - count + 1, nfrms);
        fclose(f);
      }
    }
  }

  vsapi->freeNode(child);
}

// Appends the "OVR HELP INFORMATION" block to the already opened output file `f`.
// Every line written starts with '#'.
//
// Sections, in order:
//   - combed frames, listed individually (outArray bits 0x30 both set)
//   - combed frames grouped into ranges; a range is closed once more than 12
//     consecutive non-combed frames follow it, and is printed only if it holds more
//     than one combed frame
//   - possible missed combed frames, judged from moutArray against MI
//   - frames whose match code is 3, 4 or `ao`
//
// Returns without writing anything if any frame lacks the FILE_ENTRY flag.
// The function does not close `f`.
void TFM::generateOvrHelpOutput(FILE *f) const
{
  int ccount = 0, mcount = 0, acount = 0;
  int ordert = /*order == -1 ? child->GetParity(0) :*/ order; /// can order be -1 at this point? I think not, but test it
  // Match code reported in the last section next to 3 and 4: 0 when fieldO and order differ, 2 otherwise.
  int ao = fieldO^ordert ? 0 : 2;

  // First pass: bail out on incomplete data and count what each section will have to show.
  for (int i = 0; i < vi->numFrames; ++i)
  {
    if (!(outArray[i] & FILE_ENTRY)) return;
    const int temp = outArray[i] & 0x07;
    if (temp == 3 || temp == 4 || temp == ao) ++acount;
    if (moutArray[i] != -1) ++mcount;
    if ((outArray[i] & 0x30) == 0x30) ++ccount;
  }

  fprintf(f, "#\n#\n# OVR HELP INFORMATION:\n#\n");
  fprintf(f, "# [COMBED FRAMES]\n#\n");
  fprintf(f, "#   [Individual Frames]\n");
  fprintf(f, "#   FORMAT:  frame_number (mic_value)\n#\n");
  if (PP == 0) fprintf(f, "#   none detected (PP=0)\n");
  else if (ccount)
  {
    for (int i = 0; i < vi->numFrames; ++i)
    {
      if ((outArray[i] & 0x30) == 0x30)
      {
        if (moutArray[i] < 0) fprintf(f, "#   %d\n", i);
        else fprintf(f, "#   %d (%d)\n", i, moutArray[i]);
      }
    }
  }
  else fprintf(f, "#   none detected\n");

  fprintf(f, "#\n#   [Grouped Ranges Allowing Small Breaks]\n");
  fprintf(f, "#   FORMAT:  frame_start, frame_end (percentage combed)\n#\n");
  if (PP == 0) fprintf(f, "#   none detected (PP=0)\n");
  else if (ccount)
  {
    // icount: combed frames in the open range
    // rcount: frames since the first combed frame of the open range (0 = no open range)
    // pcount: non-combed frames since the last combed frame
    int icount = 0, pcount = 0, rcount = 0, i = 0;
    for (; i < vi->numFrames; ++i)
    {
      if ((outArray[i] & 0x30) == 0x30)
      {
        ++icount;
        ++rcount;
        pcount = 0;
      }
      else
      {
        ++pcount;
        if (rcount > 0) ++rcount;
        if (pcount > 12)
        {
          // Frame i is already included in rcount and pcount here, so the range is
          // [i - rcount + 1, i - pcount].
          if (icount > 1)
            fprintf(f, "#   %d,%d (%3.1f%c)\n", i - rcount + 1, i - pcount,
              icount*100.0 / double(rcount - pcount), '%');
          rcount = icount = 0;
        }
      }
    }
    // After the loop i is one past the last frame, so both ends shift by one compared
    // with the in-loop case: the range is [i - rcount, i - pcount - 1].
    if (icount > 1)
      fprintf(f, "#   %d,%d (%3.1f%c)\n", i - rcount, i - pcount - 1,
        icount*100.0 / double(rcount - pcount), '%');
  }
  else fprintf(f, "#   none detected\n");

  fprintf(f, "#\n#\n# [POSSIBLE MISSED COMBED FRAMES]\n#\n");
  fprintf(f, "#   FORMAT:  frame_number (mic_value)\n#\n");
  if (PP == 0) fprintf(f, "#   none detected (PP=0)\n");
  else if (mcount)
  {
    int maxcp = int(MI*0.85), count = 0;
    int mt = std::max(int(MI*0.1875), 5);
    for (int i = 0; i < vi->numFrames; ++i)
    {
      // Frames already reported as combed are not candidates.
      if ((outArray[i] & 0x30) == 0x30)
        continue;
      // Neighbours outside the clip count as 0.
      const int prev = i > 0 ? moutArray[i - 1] : 0;
      const int curr = moutArray[i];
      const int next = i < vi->numFrames - 1 ? moutArray[i + 1] : 0;
      // Candidate: value not above MI, and either an isolated spike relative to both
      // neighbours, or close to MI, or adjacent to neighbours that exceed MI.
      if (curr <= MI && ((curr >= mt && curr > next * 2 && curr > prev * 2 &&
        curr - next > mt && curr - prev > mt) || (curr > maxcp) ||
        (prev > MI && next > MI && curr > MI*0.5) ||
        ((prev > MI || next > MI) && curr > MI*0.75)))
      {
        fprintf(f, "#   %d (%d)\n", i, moutArray[i]);
        ++count;
      }
    }
    if (!count) fprintf(f, "#   none detected\n");
  }
  else fprintf(f, "#   none detected\n");

  fprintf(f, "#\n#\n# [u, b, AND AGAINST ORDER (%c) MATCHES]\n#\n", MTC(ao));
  fprintf(f, "#   FORMAT:  frame_number match  or  range_start,range_end match\n#\n");
  if (acount)
  {
    // lastf: match code of the run being collected (-1 = none), count: its length.
    int lastf = -1, count = 0, i = 0;
    for (; i < vi->numFrames; ++i)
    {
      const int temp = outArray[i] & 0x07;
      if (temp == 3 || temp == 4 || temp == ao)
      {
        if (lastf == -1) lastf = temp;
        else if (temp != lastf)
        {
          // The match code changed: flush the previous run, which ended at i - 1.
          if (count == 1) fprintf(f, "#   %d %c\n", i - 1, MTC(lastf));
          else fprintf(f, "#   %d,%d %c\n", i - count, i - 1, MTC(lastf));
          count = 0;
          lastf = temp;
        }
        ++count;
      }
      else if (count)
      {
        // The run was interrupted by a frame with another match code.
        if (count == 1) fprintf(f, "#   %d %c\n", i - 1, MTC(lastf));
        else fprintf(f, "#   %d,%d %c\n", i - count, i - 1, MTC(lastf));
        count = 0;
        lastf = -1;
      }
    }
    // Flush a run that extends to the last frame.
    if (count == 1) fprintf(f, "#   %d %c\n", i - 1, MTC(lastf));
    else if (count > 1) fprintf(f, "#   %d,%d %c\n", i - count, i - 1, MTC(lastf));
  }
  else fprintf(f, "#   none detected\n");
}
07070100000010000081A4000000000000000000000001671240C900002620000000000000000000000000000000000000002900000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFM.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <stdio.h>
#include <xmmintrin.h>
#ifndef _WIN32
#include <limits.h>
#include <stdlib.h>
#include <strings.h>
#define _strnicmp strncasecmp
#define _fullpath(absolute, relative, max) realpath((relative), (absolute))
#define MAX_PATH PATH_MAX
#else
#include <windows.h>
#endif
#include <memory>
#include <vector>
#include <string>
#include <VapourSynth.h>
#include <VSHelper.h>
#include "calcCRC.h"
#include "internal.h"
#include "cpufeatures.h"


// Declaration only; no definition is visible in this part of the header.
// Parameterised at compile time by the integer `planarType` and operates on the
// writable frame `cmask` through the VapourSynth API table `vsapi`.
// NOTE(review): judging by the name, this presumably folds chroma (U/V) information
// into the combing mask - confirm against the definition.
template<int planarType>
void FillCombedPlanarUpdateCmaskByUV(VSFrameRef* cmask, const VSAPI *vsapi);

// Declaration only; no definition is visible in this part of the header.
// Parameterised by the sample type `pixel_t`. Takes the clip description `vi`, the
// settings `cthresh`, `chroma`, `cpuFlags` and `metric`, a read-only source frame `src`
// and a writable frame `cmask`.
// NOTE(review): presumably analyses `src` for combing and writes the result into
// `cmask` - confirm against the definition.
template<typename pixel_t>
void checkCombedPlanarAnalyze_core(const VSVideoInfo *vi, int cthresh, bool chroma, int cpuFlags, int metric, const VSFrameRef *src, VSFrameRef* cmask, const VSAPI *vsapi);

// Plain aggregate of four ints. TFM keeps one instance of it as `lastMatch`.
// NOTE(review): the member names suggest a frame number plus the match, field and
// combed state recorded for that frame - confirm at the use sites.
struct MTRACK {
  int frame;
  int match;
  int field;
  int combed;
};

// Plain aggregate. TFM keeps one instance of it as `sclast`, which its constructor
// seeds with frame = -20 and sc = true.
// NOTE(review): presumably caches the latest scene-change measurement (frame number,
// difference value, scene-change flag) - confirm at the use sites.
struct SCTRACK {
  int           frame;
  unsigned long diff;
  bool          sc;
};

// TFM: the field matching filter of TIVTC (see the file header), expressed with VapourSynth API
// types (VSAPI, VSNodeRef, VSFrameRef). Members carrying a "modified in GetFrame" remark are
// written while frames are produced; the others are not marked as such.
class TFM
{
private:
    const VSAPI *vsapi;
    VSNodeRef *child;

  CPUFeatures cpuFlags;

  // NOTE(review): the members from here down to `opt` mirror the constructor parameters of the
  // same name (with a leading underscore); the constructor body is not in this header - confirm.
  int order, field, mode; // modified in GetFrame
  int PP; // modified in GetFrame
  // TFM must store a copy of the string obtained from propGetData, because that pointer doesn't live forever.
  std::string ovr; // override file name
  std::string input;
  std::string output;
  std::string outputC;
  bool debug, display;
  int slow;
  bool mChroma;
  int cNum;
  int cthresh;
  int MI; // modified in GetFrame
  bool chroma;
  int blockx, blocky;
  int y0, y1; // band exclusion
  std::string d2v;
  int ovrDefault;
  int flags;
  double scthresh;
  int micout, micmatching;
  std::string trimIn;
  bool usehints;
  bool metric; // note: the constructor takes this as int _metric
  bool batch, ubsco, mmsco;
  int opt;

  // NOTE(review): the *_origSaved names suggest copies of the initial values of the members that
  // GetFrame modifies - confirm where they are written and restored.
  int PP_origSaved, MI_origSaved;
  int order_origSaved, field_origSaved, mode_origSaved;
  int nfrms;
  int xhalf, yhalf, xshift, yshift;
  int vidCount, fieldO, mode7_field; // mode7_field modified in GetFrame, but only when mode is 7
  uint32_t outputCrc;
  unsigned long diffmaxsc;
  
  // Buffer released with vs_aligned_free.
  std::unique_ptr<int, decltype (&vs_aligned_free)> cArray; // modified in GetFrame
  std::vector<int> setArray;

  // One flag per d2v frame; fillTrimArray() clears the flag of every frame inside a trimmed range.
  std::vector<bool> trimArray;

  // Set by D2V_fill_d2vfilmarray().
  double d2vpercent;
  
  std::vector<uint8_t> ovrArray;
  std::vector<uint8_t> outArray; // modified in GetFrame, but only the element corresponding to frame n, so multithreaded access is fine
  std::vector<uint8_t> d2vfilmarray; // per-frame flag bytes built by D2V_fill_d2vfilmarray()

  std::unique_ptr<uint8_t, decltype (&vs_aligned_free)> tbuffer; // absdiff buffer // modified in GetFrame
  int tpitchy, tpitchuv;

  std::vector<int> moutArray; // modified in GetFrame, but only the element corresponding to frame n
  std::vector<int> moutArrayE; // modified in GetFrame, but only the elements corresponding to frame n
  
  MTRACK lastMatch; // modified in GetFrame
  SCTRACK sclast;  // modified in GetFrame
  char outputFull[MAX_PATH], outputCFull[MAX_PATH];
  // Frames owned by this object; the deleter has the type of VSAPI::freeFrame.
  std::unique_ptr<VSFrameRef, decltype (VSAPI::freeFrame)> map; // modified in GetFrame
  std::unique_ptr<VSFrameRef, decltype (VSAPI::freeFrame)> cmask; // modified in GetFrame

  template<typename pixel_t>
  void buildDiffMapPlane_Planar(const uint8_t *prvp, const uint8_t *nxtp,
    uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
    int Width, int tpitch, int bits_per_pixel);
//  void buildDiffMapPlaneYUY2(const uint8_t *prvp, const uint8_t *nxtp,
//    uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
//    int Width, int tpitch, IScriptEnvironment *env);
  
  template<typename pixel_t>
  void buildDiffMapPlane2(const uint8_t *prvp, const uint8_t *nxtp,
    uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
    int Width, int bits_per_pixel) const;

  void fileOut(int match, int combed, bool d2vfilm, int n, int MICount, int mics[5]);

  // Each non-template compare* member below has a pixel_t-templated *_core counterpart.
  int compareFields(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
    int match2, int &norm1, int &norm2, int &mtn1, int &mtn2, int n);
  template<typename pixel_t>
  int compareFields_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
    int match2, int& norm1, int& norm2, int& mtn1, int& mtn2, int n);

  int compareFieldsSlow(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
    int match2, int &norm1, int &norm2, int &mtn1, int &mtn2, int n);
  template<typename pixel_t>
  int compareFieldsSlow_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
    int match2, int& norm1, int& norm2, int& mtn1, int& mtn2, int n);
  template<typename pixel_t>
  int compareFieldsSlow2_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int match1,
    int match2, int& norm1, int& norm2, int& mtn1, int& mtn2, int n);

  void createWeaveFrame(VSFrameRef *dst, const VSFrameRef *prv, const VSFrameRef *src,
    const VSFrameRef *nxt, int match, int &cfrm) const;
  
  bool getMatchOvr(int n, int &match, int &combed, bool &d2vmatch, bool isSC);
  void getSettingOvr(int n);
  
  bool checkCombed(const VSFrameRef *src, int n, int match,
    int *blockN, int &xblocksi, int *mics, bool ddebug);
  bool checkCombedPlanar(const VSFrameRef *src, int n, int match,
    int *blockN, int &xblocksi, int *mics, bool ddebug, bool _chroma);
  template<typename pixel_t>
  bool checkCombedPlanar_core(const VSFrameRef *src, int n, int match,
    int* blockN, int& xblocksi, int* mics, bool ddebug, int bits_per_pixel);
//  bool checkCombedYUY2(const VSFrameRef *src, int n, int match,
//    int *blockN, int &xblocksi, int *mics, bool ddebug, bool chroma,int cthresh);
  
  void writeDisplay(VSFrameRef *dst, int n, int fmatch, int combed, bool over,
    int blockN, int xblocks, bool d2vmatch, int *mics, const VSFrameRef *prv,
    const VSFrameRef *src, const VSFrameRef *nxt);

  void putFrameProperties(VSFrameRef *dst, int match, int combed, bool d2vfilm, const int mics[5]) const;
//  template<typename pixel_t>
//  void putHint_core(VSFrameRef *dst, int match, int combed, bool d2vfilm);

  // d2v project file handling; these members are defined in TFMD2V.cpp.
  // parseD2V() is the entry point and reports failures by throwing TIVTCError.
  void parseD2V();
  int D2V_find_and_correct(std::vector<int> &array, bool &found, int &tff) const;
  void D2V_find_fix(int a1, int a2, int sync, int &f1, int &f2, int &change) const;
  bool D2V_check_illegal(int a1, int a2) const;
  int D2V_check_final(const std::vector<int> &array) const;
  int D2V_initialize_array(std::vector<int> &array, int &d2vtype, int &frames) const;
  int D2V_write_array(const std::vector<int> &array, char wfile[]) const;
  int D2V_get_output_filename(char wfile[]) const;
  int D2V_fill_d2vfilmarray(const std::vector<int> &array, int frames);
  bool d2vduplicate(int match, int combed, int n);
  bool checkD2VCase(int check) const;
  bool checkInPatternD2V(const std::vector<int> &array, int i) const;
  int fillTrimArray(int frames);

  bool checkSceneChange(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int n);
  template<typename pixel_t>
  bool checkSceneChange_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
    int n, int bits_per_pixel);

  void micChange(int n, int m1, int m2, VSFrameRef *dst, const VSFrameRef *prv,
    const VSFrameRef *src, const VSFrameRef *nxt, int &fmatch,
    int &combed, int &cfrm) const;
  void checkmm(int &cmatch, int m1, int m2, VSFrameRef *dst, int &dfrm, VSFrameRef *tmp, int &tfrm,
    const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt, int n,
    int *blockN, int &xblocks, int *mics);

  // O.K. common parts with TDeint
  // fixme: hbd!
  template<typename pixel_t>
  void buildABSDiffMask(const uint8_t *prvp, const uint8_t *nxtp,
    int prv_pitch, int nxt_pitch, int tpitch, int width, int height) const;

  void generateOvrHelpOutput(FILE *f) const;

public:
      const VSVideoInfo *vi;

  // Frame callback taking the VapourSynth activation reason and frame context.
  const VSFrameRef *GetFrame(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core);
/// implement as tivtc.IsCombed(), if it's different from tdm.IsCombed().
  //  AVSValue ConditionalIsCombedTIVTC(int n, IScriptEnvironment* env);
  // The string arguments are plain C strings; the class stores std::string copies (see the
  // remark above `ovr`).
  TFM(VSNodeRef *_child, int _order, int _field, int _mode, int _PP, const char* _ovr, const char* _input,
    const char* _output, const char * _outputC, bool _debug, bool _display, int _slow,
    bool _mChroma, int _cNum, int _cthresh, int _MI, bool _chroma, int _blockx, int _blocky,
    int _y0, int _y1, const char* _d2v, int _ovrDefault, int _flags, double _scthresh, int _micout,
    int _micmatching, const char* _trimIn, bool _usehints, int _metric, bool _batch, bool _ubsco,
    bool _mmsco, int _opt, const VSAPI *_vsapi, VSCore *core);
  ~TFM();

//  int __stdcall SetCacheHints(int cachehints, int frame_range) override {
//    return cachehints == CACHE_GET_MTMODE ? MT_SERIALIZED : 0;
//  }
};
07070100000011000081A4000000000000000000000001671240C900001354000000000000000000000000000000000000002E00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMASM.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "TFMasm.h"
#include "emmintrin.h"

// Sum of absolute byte differences between two planes.
//
//   prvp, srcp           first row of each plane; every row start must be 16-byte aligned
//                        because rows are read with aligned 128-bit loads
//   height               number of rows
//   width                row length in bytes; rows are consumed in 16-byte steps, so the last
//                        step reads up to the next multiple of 16
//   prv_pitch, src_pitch byte distance between consecutive rows
//   diffp                receives the total
//
// _mm_sad_epu8 produces two zero-extended 64-bit partial sums, so the accumulation and the final
// read-out are done on 64-bit lanes. Accumulating with 32-bit adds and reading the result back as
// a signed int (as this routine used to do) corrupts totals of 2^31 and above.
void checkSceneChangePlanar_1_SSE2(const uint8_t *prvp, const uint8_t *srcp,
  int height, int width, int prv_pitch, int src_pitch, uint64_t &diffp)
{
  __m128i sum = _mm_setzero_si128();
  while (height--) {
    for (int x = 0; x < width; x += 16)
    {
      const __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + x));
      const __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));
      sum = _mm_add_epi64(sum, _mm_sad_epu8(src1, src2));
    }
    prvp += prv_pitch;
    srcp += src_pitch;
  }
  // Fold the upper lane onto the lower one and store the low 64 bits; _mm_storel_epi64 is
  // available on 32-bit x86 as well, unlike _mm_cvtsi128_si64.
  const __m128i res = _mm_add_epi64(sum, _mm_srli_si128(sum, 8));
  uint64_t total = 0;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&total), res);
  diffp = total;
}


// Sums of absolute byte differences of the current plane against the previous and the next one.
//
//   prvp, srcp, nxtp     first row of each plane; every row start must be 16-byte aligned
//                        because rows are read with aligned 128-bit loads
//   height               number of rows
//   width                row length in bytes; rows are consumed in 16-byte steps, so the last
//                        step reads up to the next multiple of 16
//   *_pitch              byte distance between consecutive rows of the respective plane
//   diffp                receives SAD(previous, current)
//   diffn                receives SAD(next, current)
//
// _mm_sad_epu8 produces two zero-extended 64-bit partial sums, so the accumulation and the final
// read-out are done on 64-bit lanes. Accumulating with 32-bit adds and reading the result back as
// a signed int (as this routine used to do) corrupts totals of 2^31 and above.
void checkSceneChangePlanar_2_SSE2(const uint8_t *prvp, const uint8_t *srcp,
  const uint8_t *nxtp, int height, int width, int prv_pitch, int src_pitch,
  int nxt_pitch, uint64_t &diffp, uint64_t &diffn)
{
  __m128i sump = _mm_setzero_si128();
  __m128i sumn = _mm_setzero_si128();
  while (height--) {
    for (int x = 0; x < width; x += 16)
    {
      const __m128i src_prev = _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + x));
      const __m128i src_curr = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));
      const __m128i src_next = _mm_load_si128(reinterpret_cast<const __m128i *>(nxtp + x));
      sump = _mm_add_epi64(sump, _mm_sad_epu8(src_prev, src_curr));
      sumn = _mm_add_epi64(sumn, _mm_sad_epu8(src_next, src_curr));
    }
    prvp += prv_pitch;
    srcp += src_pitch;
    nxtp += nxt_pitch;
  }
  // Fold the upper lane onto the lower one and store the low 64 bits of each accumulator.
  const __m128i resp = _mm_add_epi64(sump, _mm_srli_si128(sump, 8));
  const __m128i resn = _mm_add_epi64(sumn, _mm_srli_si128(sumn, 8));
  uint64_t totalp = 0;
  uint64_t totaln = 0;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&totalp), resp);
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&totaln), resn);
  diffp = totalp;
  diffn = totaln;
}


// Sum of absolute differences over the even-indexed bytes of two packed buffers; the 0x00FF
// word mask zeroes every odd-indexed byte before the comparison (for YUY2 data the even bytes
// are the luma samples).
//
//   prvp, srcp           first row of each buffer; every row start must be 16-byte aligned
//                        because rows are read with aligned 128-bit loads
//   height               number of rows
//   width                row length in bytes; rows are consumed in 16-byte steps, so the last
//                        step reads up to the next multiple of 16
//   prv_pitch, src_pitch byte distance between consecutive rows
//   diffp                receives the total
//
// _mm_sad_epu8 produces two zero-extended 64-bit partial sums, so the accumulation and the final
// read-out are done on 64-bit lanes. Accumulating with 32-bit adds and reading the result back as
// a signed int (as this routine used to do) corrupts totals of 2^31 and above.
void checkSceneChangeYUY2_1_SSE2(const uint8_t *prvp, const uint8_t *srcp,
  int height, int width, int prv_pitch, int src_pitch, uint64_t &diffp)
{
  __m128i sum = _mm_setzero_si128();
  const __m128i lumaMask = _mm_set1_epi16(0x00FF);
  while (height--) {
    for (int x = 0; x < width; x += 16)
    {
      const __m128i src1 = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + x)), lumaMask);
      const __m128i src2 = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x)), lumaMask);
      sum = _mm_add_epi64(sum, _mm_sad_epu8(src1, src2));
    }
    prvp += prv_pitch;
    srcp += src_pitch;
  }
  // Fold the upper lane onto the lower one and store the low 64 bits.
  const __m128i res = _mm_add_epi64(sum, _mm_srli_si128(sum, 8));
  uint64_t total = 0;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&total), res);
  diffp = total;
}


// Sums of absolute differences over the even-indexed bytes of the current buffer against the
// previous and the next one; the 0x00FF word mask zeroes every odd-indexed byte before the
// comparison (for YUY2 data the even bytes are the luma samples).
//
//   prvp, srcp, nxtp     first row of each buffer; every row start must be 16-byte aligned
//                        because rows are read with aligned 128-bit loads
//   height               number of rows
//   width                row length in bytes; rows are consumed in 16-byte steps, so the last
//                        step reads up to the next multiple of 16
//   *_pitch              byte distance between consecutive rows of the respective buffer
//   diffp                receives SAD(previous, current)
//   diffn                receives SAD(next, current)
//
// _mm_sad_epu8 produces two zero-extended 64-bit partial sums, so the accumulation and the final
// read-out are done on 64-bit lanes. Accumulating with 32-bit adds and reading the result back as
// a signed int (as this routine used to do) corrupts totals of 2^31 and above.
void checkSceneChangeYUY2_2_SSE2(const uint8_t *prvp, const uint8_t *srcp,
  const uint8_t *nxtp, int height, int width, int prv_pitch, int src_pitch,
  int nxt_pitch, uint64_t &diffp, uint64_t &diffn)
{
  __m128i sump = _mm_setzero_si128();
  __m128i sumn = _mm_setzero_si128();
  const __m128i lumaMask = _mm_set1_epi16(0x00FF);
  while (height--) {
    for (int x = 0; x < width; x += 16)
    {
      const __m128i src_prev = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(prvp + x)), lumaMask);
      const __m128i src_curr = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x)), lumaMask);
      const __m128i src_next = _mm_and_si128(
        _mm_load_si128(reinterpret_cast<const __m128i *>(nxtp + x)), lumaMask);
      sump = _mm_add_epi64(sump, _mm_sad_epu8(src_prev, src_curr));
      sumn = _mm_add_epi64(sumn, _mm_sad_epu8(src_next, src_curr));
    }
    prvp += prv_pitch;
    srcp += src_pitch;
    nxtp += nxt_pitch;
  }
  // Fold the upper lane onto the lower one and store the low 64 bits of each accumulator.
  const __m128i resp = _mm_add_epi64(sump, _mm_srli_si128(sump, 8));
  const __m128i resn = _mm_add_epi64(sumn, _mm_srli_si128(sumn, 8));
  uint64_t totalp = 0;
  uint64_t totaln = 0;
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&totalp), resp);
  _mm_storel_epi64(reinterpret_cast<__m128i *>(&totaln), resn);
  diffp = totalp;
  diffn = totaln;
}

07070100000012000081A4000000000000000000000001671240C900004837000000000000000000000000000000000000002E00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMD2V.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <cstring>
#include <memory>
#include "TFM.h"

void TFM::parseD2V()
{
    std::vector<int> valIn;
  int error, D2Vformat, tff = -1, frames;
  bool found = false;
  char wfile[1024];
  error = D2V_initialize_array(valIn, D2Vformat, frames);
  if (error != 0)
  {
    if (error == 1) throw TIVTCError("TFM:  could not open specified d2v file!");
    else if (error == 2) throw TIVTCError("TFM:  d2v file is not a d2v file or is of unsupported format!");
    else if (error == 3) throw TIVTCError("TFM:  malloc failure (d2v)!");
    return;
  }
//  if (debug)
//  {
//    sprintf(buf, "TFM:  successfully opened specified d2v file.");
//    OutputDebugString(buf);
//    if (D2Vformat > 9) sprintf(buf, "TFM:  newest style (dgindex 1.2+) d2v detected.\n");
//    else if (D2Vformat > 3) sprintf(buf, "TFM:  new style (dgindex 1.0+) d2v detected.\n");
//    else if (D2Vformat > 0) sprintf(buf, "TFM:  new style (dvd2avidg 1.2+) d2v detected.\n");
//    else sprintf(buf, "TFM:  old style (dvd2avi 1.76 or 1.77) d2v detected.\n");
//    OutputDebugString(buf);
//  }
  error = D2V_find_and_correct(valIn, found, tff);
  if (error != 0 || tff == -1)
  {
    if (tff == -1) throw TIVTCError("TFM:  unknown error (no entries in d2v file?)!");
    else if (error == 1) throw TIVTCError("TFM:  illegal transition exists after fixing d2v file!");
    else if (error == 2) throw TIVTCError("TFM:  ignored rff exists after fixing d2v file!");
    return;
  }
  if (order == -1)
  {
    order = tff;
    if (field == -1) field = tff;
//    if (debug)
//    {
//      sprintf(buf, "TFM:  auto detected field order from d2v is %s.\n", order == 1 ? "TFF" : "BFF");
//      OutputDebugString(buf);
//    }
  }
  else if (order != tff)
    throw TIVTCError("TFM:  the field order of the d2v does not match the user specified field order!");
  if (!found)
  {
//    if (debug)
//    {
//      sprintf(buf, "TFM:  no errors found in d2v.\n");
//      OutputDebugString(buf);
//    }
    if (flags != 3)
    {
      if (trimIn.size())
      {
        error = fillTrimArray(frames);
        if (error == 1) throw TIVTCError("TFM:  malloc failure (trimArray)!");
        else if (error == 2) throw TIVTCError("TFM:  couldn't open trimIn file!");
        else if (error == 3) throw TIVTCError("TFM:  error parsing trimIn file. " \
          "Out of range frame numbers or invalid format!");
        else if (error == 4) throw TIVTCError("TFM:  frame count using trimIn file " \
          "doesn't match filter frame count!");
      }
      else if (frames != vi->numFrames)
      {
        char err[200] = { 0 };
        snprintf(err, 200, "TFM:  d2v frame count does not match filter frame count (%d vs %d)!",
          frames, vi->numFrames);
        throw TIVTCError(err);
      }
      error = D2V_fill_d2vfilmarray(valIn, frames);
      if (error == 2) throw TIVTCError("TFM:  malloc failure (d2vt)!");
      else if (error == 3) throw TIVTCError("TFM:  malloc failure (d2vfilmarray, 2)!");
      else if (error != 0) throw TIVTCError("TFM:  malloc failure (d2vfilmarray)!");
    }
    return;
  }
  error = D2V_get_output_filename(wfile);
  if (error != 0)
  {
    throw TIVTCError("TFM:  could not obtain output d2v filename!");
  }
  error = D2V_write_array(valIn, wfile);
  if (error != 0)
  {
    if (error == 1) throw TIVTCError("\tERROR:  could not open specified d2v file! Qutting...\n");
    else if (error == 2) throw TIVTCError("\tERROR:  could not create output d2v file! Qutting...\n");
    else if (error == 3) throw TIVTCError("\tERROR:  specified file is not a d2v file or is of unsupported format! Qutting...\n");
    return;
  }
  throw TIVTCError("TFM:  Illegal transitions found in dvd2avi project file!\n"
    "          Please use the fixed version that has been created\n"
    "          in the same directory as the original d2v file.");
}

int TFM::fillTrimArray(int frames)
{
  trimArray.resize(frames, 1);
//  if (trimArray == nullptr) return 1;
  int x, y, v;
  char linein[81];
  if (sscanf(trimIn.c_str(), "%d,%d", &x, &y) == 2)
  {
    if (x < 0 && abs(x) <= frames)
      x = frames + x;
    if (y < 0 && abs(y) <= frames)
      y = frames + y;
    if (x < 0 || x >= frames || x > y || y < 0 || y >= frames)
      return 3;
    for (v = x; v <= y; ++v)
      trimArray[v] = 0;
  }
  else
  {
    std::unique_ptr<FILE, decltype (&fclose)> f(tivtc_fopen(trimIn.c_str(), "r"), &fclose);
    if (f == nullptr) return 2;
    while (fgets(linein, 80, f.get()) != nullptr)
    {
      sscanf(linein, "%d,%d", &x, &y);
      if (x < 0 && abs(x) <= frames)
        x = frames + x;
      if (y < 0 && abs(y) <= frames)
        y = frames + y;
      if (x < 0 || x >= frames || x > y || y < 0 || y >= frames)
        return 3;
      for (v = x; v <= y; ++v)
        trimArray[v] = 0;
    }
  }
  for (v = 0, x = 0; x < frames; ++x)
  {
    if (trimArray[x]) ++v;
  }
  if (v != vi->numFrames) return 4;
  return 0;
}

// Walks the flag entries (terminated by the value 9) pair by pair and repairs every transition
// that D2V_check_illegal() rejects.
//
//   array  flag entries, modified in place; the code reads up to two entries past the current
//          one, so the array must carry trailing 9s
//   found  set to true when at least one illegal transition was encountered
//   tff    set to 0 when the first examined entry is below 2, otherwise to 1; stays -1 when the
//          loop body never runs
//
// For an illegal pair the repairs are tried in this order: swap the current entry with the next
// one, swap it with the previous one, and finally let D2V_find_fix() pick a replacement value.
// After a repair the same position is examined again.
// Returns the result of D2V_check_final() on the repaired array.
int TFM::D2V_find_and_correct(std::vector<int> &array, bool &found, int &tff) const
{
  found = false;
  tff = -1;

  int sync = 0;
  int pos = 1;
  while (array[pos] != 9)
  {
    int &prev = array[pos - 1];
    int &cur = array[pos];

    if (tff == -1)
      tff = (prev < 2) ? 0 : 1;

    if (!D2V_check_illegal(prev, cur))
    {
      ++pos;
      continue;
    }
    found = true;

    // Repair 1: exchange with the following entry when that yields a legal pair two ahead.
    const int next = array[pos + 1];
    const bool forwardCandidate = (cur == 0 && next == 3) || (cur == 2 && next == 1);
    if (forwardCandidate && !D2V_check_illegal(cur, array[pos + 2]))
    {
      const int held = cur;
      cur = next;
      array[pos + 1] = held;
      continue;
    }

    // Repair 2: exchange with the preceding entry when the pair ahead is illegal as well.
    const bool backwardCandidate = (prev == 1 && cur == 0) || (prev == 3 && cur == 2);
    if (backwardCandidate && D2V_check_illegal(cur, array[pos + 1]))
    {
      const int held = prev;
      prev = cur;
      cur = held;
      continue;
    }

    // Repair 3: replace one of the two entries.
    int f1, f2, change;
    D2V_find_fix(prev, cur, sync, f1, f2, change);
    sync += change;
    if (f1 != -1)
      prev = f1;
    else if (f2 != -1)
      cur = f2;
  }
  return D2V_check_final(array);
}

// Chooses a replacement value for one entry of the illegal pair (a1, a2).
//
//   sync    running balance kept by the caller (it adds `change` after every call)
//   f1      replacement for a1, or -1 when a1 stays
//   f2      replacement for a2, or -1 when a2 stays
//   change  -1 when the replacement clears bit 0 of an entry, +1 when it sets bit 0, and 0 when
//           the pair is in neither table
//
// There are two tables. The "clear" table is tried first when sync >= 0, the "set" table first
// when sync < 0; whichever is tried first, the other one serves as fallback.
//
// The previous implementation jumped between the two tables with goto and never terminated for
// a pair that is in neither of them. Such a pair now leaves f1 == f2 == -1 and change == 0.
// All pairs rejected by D2V_check_illegal() are covered by at least one table.
void TFM::D2V_find_fix(int a1, int a2, int sync, int &f1, int &f2, int &change) const
{
  f1 = f2 = -1;
  change = 0;

  // Replacements that clear bit 0 of an entry (1 -> 0, 3 -> 2, 3 -> 0, 1 -> 2).
  const auto tryClear = [&]() -> bool
  {
    if (a1 == 0 && a2 == 3) f2 = 0;
    else if (a1 == 1 && a2 == 0) f1 = 0;
    else if (a1 == 1 && a2 == 1) f1 = 0;
    else if (a1 == 2 && a2 == 1) f2 = 2;
    else if (a1 == 3 && a2 == 2) f1 = 2;
    else if (a1 == 3 && a2 == 3) f1 = 2;
    else return false;
    change = -1;
    return true;
  };

  // Replacements that set bit 0 of an entry (0 -> 1, 2 -> 3, 0 -> 3, 2 -> 1).
  const auto trySet = [&]() -> bool
  {
    if (a1 == 0 && a2 == 2) f1 = 1;
    else if (a1 == 0 && a2 == 3) f1 = 1;
    else if (a1 == 1 && a2 == 0) f2 = 3;
    else if (a1 == 2 && a2 == 0) f1 = 3;
    else if (a1 == 2 && a2 == 1) f1 = 3;
    else if (a1 == 3 && a2 == 2) f2 = 1;
    else return false;
    change = 1;
    return true;
  };

  if (sync >= 0)
  {
    if (!tryClear()) trySet();
  }
  else
  {
    if (!trySet()) tryClear();
  }
}

// Tells whether entry value a2 may not directly follow entry value a1.
// Rejected pairs: (0,2) (0,3) (1,0) (1,1) (2,0) (2,1) (3,2) (3,3); everything else is accepted.
bool TFM::D2V_check_illegal(int a1, int a2) const
{
  switch (a1)
  {
  case 0:
  case 3:
    return a2 == 2 || a2 == 3;
  case 1:
  case 2:
    return a2 == 0 || a2 == 1;
  default:
    return false;
  }
}

// Final validation of the (repaired) entry list, which is terminated by the value 9.
//
// Returns
//   1  a pair of neighbouring entries is rejected by D2V_check_illegal()
//   2  an entry with value 1 (or 3) is followed by another 1 (or 3) before the opposite value
//      appeared in between
//   0  otherwise
int TFM::D2V_check_final(const std::vector<int> &array) const
{
  // Value (1 or 3) that is still waiting for its counterpart; 0 when nothing is pending.
  int pending = 0;
  if (array[0] == 3) pending = 3;
  else if (array[0] == 1) pending = 1;

  for (int idx = 1; array[idx] != 9; ++idx)
  {
    const int cur = array[idx];
    if (D2V_check_illegal(array[idx - 1], cur)) return 1;

    if (cur != 1 && cur != 3) continue;

    if (pending == 0)
      pending = cur;        // start waiting for the opposite value
    else if (pending == cur)
      return 2;             // same value twice in a row
    else
      pending = 0;          // counterpart found
  }
  return 0;
}

// Loads the per-frame flag entries of the d2v file named by the `d2v` member.
//
// The file is read twice: the first pass counts the entries, the second pass stores them into
// `array`, which is sized to the count plus ten trailing 9s (9 is the end marker the other D2V_*
// routines look for).
//
//   array    receives the entries; cleared first
//   d2vtype  receives the format number: the version from a "DVD2AVIProjectFile" header as is,
//            the version from a "DGIndexProjectFile" header plus 3
//   frames   receives half of the field count, where an entry with bit 0 set counts as three
//            fields and any other entry as two
//
// Returns 0 on success, 1 when the file cannot be opened, 2 when the header is missing or not
// recognized.
//
// Hardening compared with the earlier version: the header read is checked, the field scanners
// stop at the end of the line instead of running past the terminating NUL, the hex value is
// parsed into an unsigned variable as "%x" requires, and the second pass never stores more
// entries than the first pass counted.
int TFM::D2V_initialize_array(std::vector<int> &array, int &d2vtype, int &frames) const
{
  array.clear();

  // Steps over one space-terminated field; stays on the terminating NUL when there is no space.
  const auto skipField = [](char *q) -> char *
  {
    while (*q != '\0' && *q != ' ') ++q;
    return (*q == ' ') ? q + 1 : q;
  };

  int num = 0, num2 = 0, D2Vformat = 0;
  unsigned int val = 0;
  char line[1025];

  for (int pass = 1; pass <= 2; ++pass)
  {
    std::unique_ptr<FILE, decltype (&fclose)> ind2v(tivtc_fopen(d2v.c_str(), "r"), &fclose);
    if (ind2v == nullptr) return 1;
    if (pass == 2)
    {
      array.resize(num + 10, 9);
    }

    if (fgets(line, 1024, ind2v.get()) == nullptr) return 2;
    D2Vformat = 0;
    if (strncmp(line, "DVD2AVIProjectFile", 18) != 0)
    {
      if (strncmp(line, "DGIndexProjectFile", 18) != 0)
      {
        return 2;
      }
      sscanf(line, "DGIndexProjectFile%d", &D2Vformat);
      // No upper limit is enforced on the DGIndex version (the former check is disabled).
      D2Vformat += 3;
    }
    if (D2Vformat == 0) sscanf(line, "DVD2AVIProjectFile%d", &D2Vformat);

    // Skip the settings section; it ends with the "Location" line.
    while (fgets(line, 1024, ind2v.get()) != nullptr)
    {
      if (strncmp(line, "Location", 8) == 0) break;
    }
    // One more line is skipped; the one after it is the first data line. If the file ends
    // here there is nothing to parse.
    if (fgets(line, 1024, ind2v.get()) == nullptr || fgets(line, 1024, ind2v.get()) == nullptr)
      line[0] = '\0';

    // Number of leading fields in front of the flag entries, depending on the format.
    const int leadingFields = 3 + (D2Vformat > 9 ? 1 : 0) + (D2Vformat > 0 ? 2 : 0)
      + (D2Vformat > 18 ? 1 : 0);

    do
    {
      char *p = line;
      for (int k = 0; k < leadingFields; ++k) p = skipField(p);

      while (*p > 47 && *p < 123)
      {
        if (pass == 1) ++num;
        else if (num2 < num)
        {
          sscanf(p, "%x", &val);
          if (D2Vformat > 9)
          {
            if (D2Vformat > 10 && val == 0xFF) array[num2++] = 9;
            else if (D2Vformat == 10 && (val & 0x40)) array[num2++] = 9;
            else array[num2++] = static_cast<int>(val & 0x03);
          }
          else array[num2++] = static_cast<int>(val & ~0x10u);
        }
        // Move to the start of the next entry without stepping over the end of the string.
        while (*p != ' ' && *p != '\n' && *p != '\0') p++;
        if (*p != '\0') p++;
      }
    } while ((fgets(line, 1024, ind2v.get()) != nullptr) && line[0] > 47 && line[0] < 123);
  }

  d2vtype = D2Vformat;
  frames = 0;
  for (int i = 0; array[i] != 9; ++i)
  {
    frames += (array[i] & 1) ? 3 : 2;
  }
  frames >>= 1;
  return 0;
}

// Copies the d2v file named by the `d2v` member to `wfile`, replacing the per-frame flag entries
// with the values from `array` (consumed in file order through `num`). All other text is copied
// unchanged.
//
// Returns 0 on success, 1 when the input cannot be opened, 2 when the output cannot be opened,
// 3 when the first line is neither a "DVD2AVIProjectFile" nor a "DGIndexProjectFile" header.
//
// NOTE(review): the fgets() results for the header line and for the two lines after "Location"
// are not checked, and the space-skipping loops have no end-of-string test; a truncated or
// malformed file can therefore be read past the end of `line`.
int TFM::D2V_write_array(const std::vector<int> &array, char wfile[]) const
{
  int num = 0, D2Vformat, val;
  char line[1025], *p, tbuf[16];
  std::unique_ptr<FILE, decltype (&fclose)> ind2v(tivtc_fopen(d2v.c_str(), "r"), &fclose);
  if (ind2v == nullptr) return 1;
  std::unique_ptr<FILE, decltype (&fclose)> outd2v(tivtc_fopen(wfile, "w"), &fclose);
  if (outd2v == nullptr) return 2;
  fgets(line, 1024, ind2v.get());
  D2Vformat = 0;
  if (strncmp(line, "DVD2AVIProjectFile", 18) != 0)
  {
    if (strncmp(line, "DGIndexProjectFile", 18) != 0)
    {
      return 3;
    }
    sscanf(line, "DGIndexProjectFile%d", &D2Vformat);
    /* Disabled the check for newer formats
    if (D2Vformat > 14)
    {
      fclose(ind2v);
      ind2v = nullptr;
      return 3;
    }
    */
    // DGIndex versions are offset by 3 so that both header kinds share one format scale.
    D2Vformat += 3;
  }
  if (D2Vformat == 0) sscanf(line, "DVD2AVIProjectFile%d", &D2Vformat);
  fputs(line, outd2v.get());
  // Copy everything up to and including the "Location" line.
  while (fgets(line, 1024, ind2v.get()) != nullptr)
  {
    fputs(line, outd2v.get());
    if (strncmp(line, "Location", 8) == 0) break;
  }
  // The line after "Location" is copied as is; the next one is the first data line.
  fgets(line, 1024, ind2v.get());
  fputs(line, outd2v.get());
  fgets(line, 1024, ind2v.get());
  do
  {
    // Skip the leading fields of the data line; how many there are depends on the format.
    p = line;
    while (*p++ != ' ');
    while (*p++ != ' ');
    if (D2Vformat > 9) while (*p++ != ' ');
    while (*p++ != ' ');
    if (D2Vformat > 0)
    {
      while (*p++ != ' ');
      while (*p++ != ' ');
      if (D2Vformat > 18)
        while (*p++ != ' ');
    }
    // Rewrite each flag entry of the line in place.
    while (*p > 47 && *p < 123)
    {
      if (D2Vformat < 10)
      {
        // Move to the last digit of the entry and overwrite it with the array value.
        while (*(p + 1) >= '0' && *(p + 1) <= '9') p++;
        *p = array[num++] + '0';
      }
      else
      {
        // Hex entry: replace the two low bits with the array value, unless that value is the
        // end marker 9, in which case the entry keeps its bits and `num` does not advance.
        sscanf(p, "%x", &val);
        if (array[num] != 9)
        {
          val &= ~0x03;
          val |= array[num++];
        }
        // NOTE(review): for val < 0x10 sprintf yields a single digit, so tbuf[1] is the
        // terminating NUL and is written into `line`, cutting the line short.
        sprintf(tbuf, "%x", val);
        *p = tbuf[0]; ++p;
        *p = tbuf[1];
      }
      while (*p != ' ' && *p != '\n') p++;
      p++;
    }
    fputs(line, outd2v.get());
  } while ((fgets(line, 1024, ind2v.get()) != nullptr) && line[0] > 47 && line[0] < 123);
  // `line` now holds the first line after the data section (or the last data line again if the
  // file ended); it is written out, followed by the remainder of the file.
  fputs(line, outd2v.get());
  while (fgets(line, 1024, ind2v.get()) != nullptr) fputs(line, outd2v.get());
  return 0;
}

// Chooses the name for the corrected d2v file and stores it in `wfile`.
//
// The name is the path in the `d2v` member with its extension replaced by "-FIXED.d2v". If a
// file of that name can already be opened, "-FIXED_1.d2v" ... "-FIXED_99.d2v" are tried in turn.
// The chosen file is then created (empty) to verify that it is writable.
//
//   wfile  output buffer; parseD2V() passes an array of 1024 chars, which is the capacity
//          assumed here
//
// Returns 0 on success, 1 when no usable name exists (all candidates taken, or the name does
// not fit into the buffer), 2 when the file cannot be created.
//
// Differences from the earlier pointer-based version: the buffer can no longer be overrun by a
// long path, a path without any '.' no longer makes the scan run in front of the buffer, a '.'
// inside a directory name is no longer mistaken for the extension, and an existing "_99" file
// is reported as error 1 instead of being truncated.
int TFM::D2V_get_output_filename(char wfile[]) const
{
  const size_t capacity = 1024; // size of the buffer supplied by parseD2V()

  // Strip the extension: the last '.' counts only when it follows the last path separator.
  std::string stem(d2v);
  const size_t sep = stem.find_last_of("/\\");
  const size_t dot = stem.rfind('.');
  if (dot != std::string::npos && (sep == std::string::npos || dot > sep))
    stem.erase(dot);
  stem += "-FIXED";

  // Probe candidates until one does not exist yet.
  std::string candidate = stem + ".d2v";
  for (int suffix = 1; ; ++suffix)
  {
    FILE *probe = tivtc_fopen(candidate.c_str(), "r");
    if (probe == nullptr) break;
    fclose(probe);
    if (suffix > 99) return 1;
    candidate = stem + "_" + std::to_string(suffix) + ".d2v";
  }

  if (candidate.size() + 1 > capacity) return 1;
  memcpy(wfile, candidate.c_str(), candidate.size() + 1);

  // Create the file now so that an unwritable location is detected early; D2V_write_array()
  // reopens it for the actual output.
  FILE *outd2v = tivtc_fopen(wfile, "w");
  if (outd2v == nullptr) return 2;
  fclose(outd2v);
  return 0;
}

int TFM::D2V_fill_d2vfilmarray(const std::vector<int> &array, int frames)
{
  int i = 0, v, fields = 0, val, outpattern = 0;
  if (d2vfilmarray.size()) { d2vfilmarray.resize(0); }
  d2vfilmarray.resize(frames + 1, 0);
//  if (d2vfilmarray == nullptr) return 1;
  while (array[i] != 9)
  {
    val = array[i];
    if (val & 1)
    {
      if (fields & 1)
      {
        d2vfilmarray[fields >> 1] |= 8;
        if (checkInPatternD2V(array, i))
        {
          d2vfilmarray[fields >> 1] |= 64;
          d2vfilmarray[(fields + 2) >> 1] |= 64;
        }
        else ++outpattern;
        d2vfilmarray[(fields + 2) >> 1] |= 4;
      }
      else
      {
        d2vfilmarray[fields >> 1] |= 4;
        if (checkInPatternD2V(array, i)) d2vfilmarray[fields >> 1] |= 64;
        else ++outpattern;
      }
      d2vfilmarray[(fields + 2) >> 1] |= val == 3 ? 0x3 : 0x1;
      fields += 3;
    }
    else
    {
      if (fields & 1) d2vfilmarray[fields >> 1] |= 8;
      else d2vfilmarray[fields >> 1] |= 4;
      if (checkInPatternD2V(array, i)) d2vfilmarray[fields >> 1] |= 64;
      else ++outpattern;
      fields += 2;
    }
    ++i;
  }
  if (i == 0) return 0;
  d2vpercent = double(i - outpattern)*100.0 / double(i);
//  if (debug)
//  {
//    sprintf(buf, "TFM:  d2vflags = %d  out_of_pattern = %d  (%3.1f%s FILM)\n", i, outpattern,
//      d2vpercent, "%");
//    OutputDebugString(buf);
//  }
  if (flags == 0) d2vpercent = -20.0;
  if (trimIn.size() && trimArray.size())
  {
    std::vector<uint8_t> d2vt(vi->numFrames, 0);
//    if (d2vt == nullptr) return 2;
    for (v = 0, i = 0; i <= nfrms && v < frames; ++v)
    {
      if (trimArray[v])
      {
        if (v == 0 || trimArray[v - 1]) d2vt[i++] = d2vfilmarray[v];
        else ++i;
      }
    }
    d2vfilmarray.resize(0);
    d2vfilmarray.resize(vi->numFrames);
//    if (d2vfilmarray == nullptr)
//    {
//      free(d2vt);
//      return 3;
//    }
    memcpy(d2vfilmarray.data(), d2vt.data(), vi->numFrames * sizeof(unsigned char));
    trimArray.resize(0);
  }
  return 0;
}

// Tells whether entry i sits inside a run that follows the repeating 0,1,2,3 cycle.
//
// A window is four consecutive entries packed into one hex digit each; checkD2VCase() accepts
// the four rotations of 0,1,2,3. The entry is in pattern when two windows that start one entry
// apart are both accepted. Which two windows are examined depends on the position: near the end
// of the list (value 9 one or two entries ahead) they are shifted backwards, at i == 0 and
// i == 1 they are shifted forwards.
// The array is read up to four entries past i, so it must carry trailing padding.
bool TFM::checkInPatternD2V(const std::vector<int> &array, int i) const
{
  // Packs array[start .. start + 3] into a 16-bit code, first entry in the top nibble.
  const auto window = [&array](int start) -> int
  {
    return (array[start] << 12) + (array[start + 1] << 8) + (array[start + 2] << 4) + array[start + 3];
  };
  // True when the windows starting at `start` and `start + 1` are both accepted.
  const auto pairMatches = [&](int start) -> bool
  {
    return checkD2VCase(window(start)) && checkD2VCase(window(start + 1));
  };

  // Last entry of the list.
  if (array[i + 1] == 9 && i > 3 && pairMatches(i - 4)) return true;
  // Second to last entry.
  if (array[i + 2] == 9 && array[i + 1] != 9 && i > 2 && pairMatches(i - 3)) return true;
  // General case.
  if (i >= 2 && pairMatches(i - 2)) return true;
  // First two entries.
  if (i == 1 && pairMatches(i - 1)) return true;
  if (i == 0 && pairMatches(i)) return true;
  return false;
}

// Accepts the four packed windows that are rotations of the sequence 0,1,2,3
// (0x0123, 0x1230, 0x2301, 0x3012); rejects every other value.
bool TFM::checkD2VCase(int check) const
{
  return check == 0x123
    || check == 0x1230
    || check == 0x2301
    || check == 0x3012;
}
07070100000013000081A4000000000000000000000001671240C900012276000000000000000000000000000000000000002D00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMPP.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <memory>
#include "TFM.h"
#include "TFMPP.h"
#include "TCommonASM.h"
#include "emmintrin.h"
#include "smmintrin.h"


// Frame callback of the TFMPP post-processor.
//
// On arInitial it requests the current frame from `child` (plus the previous
// and next frames when PP > 4, and frame n of `clip2` when uC2 is set) and
// returns nullptr. On arAllFramesReady it returns the source frame untouched
// when getProperties() reports it as not combed; otherwise it builds and
// returns a new frame, produced either from clip2 or by one of the
// BlendDeint / CubicDeint / elaDeint routines. Any other activation reason
// yields nullptr.
const VSFrameRef *TFMPP::GetFrame(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core)
{
  // Keep the frame number inside [0, nfrms].
  if (n < 0) n = 0;
  else if (n > nfrms) n = nfrms;

  if (activationReason == arInitial)
  {
    // Modes above 4 additionally look at both neighbouring frames.
    const bool needNeighbours = PP > 4;

    if (needNeighbours)
      vsapi->requestFrameFilter(std::max(0, n - 1), child, frameCtx);
    if (uC2)
      vsapi->requestFrameFilter(n, clip2, frameCtx);
    vsapi->requestFrameFilter(n, child, frameCtx);
    if (needNeighbours)
      vsapi->requestFrameFilter(std::min(n + 1, nfrms), child, frameCtx);

    return nullptr;
  }

  if (activationReason != arAllFramesReady)
    return nullptr;

  bool combed;
  int fieldSrc, field;
  const VSFrameRef *src = vsapi->getFrameFilter(n, child, frameCtx);
  getProperties(src, fieldSrc, combed);
  if (!combed)
    return src; // nothing to repair: hand the source reference straight back

  // NOTE(review): PP is only examined after this call; presumably getSetOvr
  // can adjust per-frame settings, so keep this ordering - confirm.
  getSetOvr(n);

  VSFrameRef *dst = nullptr;

  // Allocates an output frame with the clip's format/size and src's properties.
  const auto newOutputFrame = [&]() {
    return vsapi->newVideoFrame(vi->format, vi->width, vi->height, src, core);
  };

  // Returns a writable copy of frame n of clip2.
  const auto copyOfClip2 = [&]() {
    const VSFrameRef *frame = vsapi->getFrameFilter(n, clip2, frameCtx);
    VSFrameRef *copy = vsapi->copyFrame(frame, core);
    vsapi->freeFrame(frame);
    return copy;
  };

  // Fills dst from src: blend when PP == blendPP, cubic when PP == cubicPP,
  // ELA for every other mode.
  const auto deinterlace = [&](int blendPP, int cubicPP, bool nomask) {
    if (PP == blendPP)
    {
      BlendDeint(src, mmask, dst, nomask);
    }
    else if (PP == cubicPP)
    {
      copyField(dst, src, fieldSrc);
      CubicDeint(src, mmask, dst, nomask, fieldSrc);
    }
    else
    {
      copyFrame(dst, src, vsapi);
      elaDeint(dst, mmask, src, nomask, fieldSrc);
    }
  };

  if (PP > 4)
  {
    // Bit 0 of `use`: previous frame is usable, bit 1: next frame is usable.
    int use = 0;

    const VSFrameRef *prv = vsapi->getFrameFilter(std::max(0, n - 1), child, frameCtx);
    getProperties(prv, field, combed);
    if (!combed && field != -1 && n != 0) ++use;

    const VSFrameRef *nxt = vsapi->getFrameFilter(std::min(n + 1, nfrms), child, frameCtx);
    getProperties(nxt, field, combed);
    if (!combed && field != -1 && n != nfrms) use += 2;

    if (use > 0)
    {
      // At least one neighbour can be used: restrict processing to the motion mask.
      dst = newOutputFrame();
      buildMotionMask(prv, src, nxt, mmask, use);
      if (uC2)
      {
        const VSFrameRef *frame = vsapi->getFrameFilter(n, clip2, frameCtx);
        maskClip2(src, frame, mmask, dst);
        vsapi->freeFrame(frame);
      }
      else
      {
        deinterlace(5, 6, false);
      }
    }
    else if (uC2)
    {
      dst = copyOfClip2();
    }
    else
    {
      // No usable neighbour: process the whole frame without a mask.
      dst = newOutputFrame();
      deinterlace(5, 6, true);
    }

    vsapi->freeFrame(prv);
    vsapi->freeFrame(nxt);
  }
  else if (uC2)
  {
    // PP <= 4
    dst = copyOfClip2();
  }
  else
  {
    // PP <= 4
    dst = newOutputFrame();
    deinterlace(2, 3, true);
  }

  vsapi->freeFrame(src);
  if (display) writeDisplay(dst, n, fieldSrc);
  return dst;
}

// Picks the buildMotionMask_core instantiation that matches the clip's
// sample size: uint8_t for one byte per sample, uint16_t otherwise.
void TFMPP::buildMotionMask(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
  VSFrameRef *mask, int use) const
{
  const bool wideSamples = vi->format->bytesPerSample != 1;
  if (wideSamples)
  {
    buildMotionMask_core<uint16_t>(prv, src, nxt, mask, use);
    return;
  }
  buildMotionMask_core<uint8_t>(prv, src, nxt, mask, use);
}

// Builds the per-plane motion mask for `src` and stores it in `mask`.
//
// The mask is written as bytes: 0xFF marks a pixel as moving, 0 as static.
// A pixel is tested together with the pixels directly above and below it,
// using absolute differences against `mthresh` scaled to the clip bit depth.
//
//   prv, src, nxt  previous, current and next frame
//   mask           destination mask frame (accessed as uint8_t)
//   use            1: compare src against prv only
//                  2: compare src against nxt only
//                  any other value: compare against both neighbours
//
// Row 0 and row height-1 of every plane end up fully 0xFF. After all planes
// are done the mask is passed through denoisePlanar() and then through the
// linkPlanar<>() variant matching the clip's chroma subsampling.
// The SSE2 routines are only used for 1-byte samples.
template<typename pixel_t>
void TFMPP::buildMotionMask_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
  VSFrameRef* mask, int use) const
{
  bool use_sse2 = cpuFlags.sse2;

  const int np = vi->format->numPlanes;
  for (int b = 0; b < np; ++b)
  {
    const int plane = b;
    // For each input frame: xxxpp = row 0, xxxp = row 1, xxxpn = row 2.
    // Pitches are expressed in pixels, not bytes.
    const pixel_t *prvpp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(prv, plane));
    const int prv_pitch = vsapi->getStride(prv, plane) / sizeof(pixel_t);
    const pixel_t*prvp = prvpp + prv_pitch;
    const pixel_t*prvpn = prvp + prv_pitch;

    const pixel_t *srcpp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t);
    
    const int width = vsapi->getFrameWidth(src, plane);
    const int height = vsapi->getFrameHeight(src, plane);

    const pixel_t *srcp = srcpp + src_pitch;
    const pixel_t *srcpn = srcp + src_pitch;

    const pixel_t *nxtpp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(nxt, plane));
    const int nxt_pitch = vsapi->getStride(nxt, plane) / sizeof(pixel_t);
    const pixel_t *nxtp = nxtpp + nxt_pitch;
    const pixel_t *nxtpn = nxtp + nxt_pitch;

    uint8_t *maskw = vsapi->getWritePtr(mask, b);
    const int msk_pitch = vsapi->getStride(mask, b);

    // The mask pointer starts at row 1; rows 0 and height-1 are only filled with 0xFF.
    maskw += msk_pitch;
    
    // Threshold scaled from 8-bit range to the clip's bit depth.
    const int mthresh_scaled = mthresh << (vi->format->bitsPerSample - 8);

    if (use == 1)
    {
      // Only the previous frame is compared against src.
      // fixme: hbd SIMD
      if (sizeof(pixel_t) == 1 && use_sse2)
        buildMotionMask1_SSE2((const uint8_t *)srcp, (const uint8_t*)prvp, maskw, src_pitch, prv_pitch, msk_pitch, width, height - 2, &cpuFlags);
      else
      {
        // Start with the whole plane marked as moving, then clear pixels where
        // none of the three vertical differences exceeds the threshold.
        memset(maskw - msk_pitch, 0xFF, msk_pitch*height);
        for (int y = 1; y < height - 1; ++y)
        {
          for (int x = 0; x < width; ++x)
          {
            if (!(abs(prvpp[x] - srcpp[x]) > mthresh_scaled || abs(prvp[x] - srcp[x]) > mthresh_scaled ||
              abs(prvpn[x] - srcpn[x]) > mthresh_scaled)) maskw[x] = 0;
          }
          prvpp += prv_pitch;
          prvp += prv_pitch;
          prvpn += prv_pitch;
          srcpp += src_pitch;
          srcp += src_pitch;
          srcpn += src_pitch;
          maskw += msk_pitch;
        }
      }
    }
    else if (use == 2)
    {
      // Only the next frame is compared against src.
      // fixme: hbd SIMD
      if (sizeof(pixel_t) == 1 && use_sse2)
        buildMotionMask1_SSE2((const uint8_t*)srcp, (const uint8_t*)nxtp, maskw, src_pitch, nxt_pitch, msk_pitch, width, height - 2, &cpuFlags);
      else
      {
        // Same scheme as above, with nxt in place of prv.
        memset(maskw - msk_pitch, 0xFF, msk_pitch*height);
        for (int y = 1; y < height - 1; ++y)
        {
          for (int x = 0; x < width; ++x)
          {
            if (!(abs(nxtpp[x] - srcpp[x]) > mthresh_scaled || abs(nxtp[x] - srcp[x]) > mthresh_scaled ||
              abs(nxtpn[x] - srcpn[x]) > mthresh_scaled)) maskw[x] = 0;
          }
          srcpp += src_pitch;
          srcp += src_pitch;
          srcpn += src_pitch;
          nxtpp += nxt_pitch;
          nxtp += nxt_pitch;
          nxtpn += nxt_pitch;
          maskw += msk_pitch;
        }
      }
    }
    else
    {
      // fixme: hbd SIMD
      // use not 1 or 2
      if (sizeof(pixel_t) == 1 && use_sse2)
      {
        // The SSE2 routine stores one flag bit per comparison:
        //   0x01 prv/src row below   0x02 src/nxt row below
        //   0x04 prv/src same row    0x08 src/nxt same row
        //   0x10 prv/src row above   0x20 src/nxt row above
        // The loop below folds those flags into the final 0xFF / 0 value.
        buildMotionMask2_SSE2((const uint8_t*)prvp, (const uint8_t*)srcp, (const uint8_t*)nxtp, maskw, prv_pitch, src_pitch, nxt_pitch, msk_pitch, width, height - 2, &cpuFlags);
        // This loop also visits row height-1, which the SSE2 routine filled
        // with 0xFF; 0xFF satisfies the first test, so that row stays 0xFF.
        for (int y = 1; y < height; ++y)
        {
          for (int x = 0; x < width; ++x)
          {
            if (!maskw[x]) continue;
            if (((maskw[x] & 0x8) && (maskw[x] & 0x15)) ||
              ((maskw[x] & 0x4) && (maskw[x] & 0x2A)) ||
              ((maskw[x] & 0x22) && ((maskw[x] & 0x11) == 0x11)) ||
              ((maskw[x] & 0x11) && ((maskw[x] & 0x22) == 0x22)))
              maskw[x] = 0xFF;
            else maskw[x] = 0;
          }
          maskw += msk_pitch;
        }
      }
      else
      {
        // Scalar version of the combined test: the pixel stays marked only when
        // both the prv and the nxt comparison show differences in one of the
        // four combinations below; otherwise it is cleared.
        memset(maskw - msk_pitch, 0xFF, msk_pitch*height);
        for (int y = 1; y < height - 1; ++y)
        {
          for (int x = 0; x < width; ++x)
          {
            if (!(((abs(prvp[x] - srcp[x]) > mthresh_scaled) && (abs(nxtpp[x] - srcpp[x]) > mthresh_scaled ||
              abs(nxtp[x] - srcp[x]) > mthresh_scaled || abs(nxtpn[x] - srcpn[x]) > mthresh_scaled)) ||
              ((abs(nxtp[x] - srcp[x]) > mthresh_scaled) && (abs(prvpp[x] - srcpp[x]) > mthresh_scaled ||
                abs(prvp[x] - srcp[x]) > mthresh_scaled || abs(prvpn[x] - srcpn[x]) > mthresh_scaled)) ||
                (abs(prvpp[x] - srcpp[x]) > mthresh_scaled && abs(prvpn[x] - srcpn[x]) > mthresh_scaled &&
              (abs(nxtpp[x] - srcpp[x]) > mthresh_scaled || abs(nxtpn[x] - srcpn[x]) > mthresh_scaled)) ||
                  ((abs(prvpp[x] - srcpp[x]) > mthresh_scaled || abs(prvpn[x] - srcpn[x]) > mthresh_scaled) &&
                    abs(nxtpp[x] - srcpp[x]) > mthresh_scaled && abs(nxtpn[x] - srcpn[x]) > mthresh_scaled)))
              maskw[x] = 0;
          }
          prvpp += prv_pitch;
          prvp += prv_pitch;
          prvpn += prv_pitch;
          srcpp += src_pitch;
          srcp += src_pitch;
          srcpn += src_pitch;
          nxtpp += nxt_pitch;
          nxtp += nxt_pitch;
          nxtpn += nxt_pitch;
          maskw += msk_pitch;
        }
      }
    }
  }

    // Post-process the finished mask: drop isolated marks, then copy luma
    // marks into the chroma planes according to the chroma subsampling.
    denoisePlanar(mask);
    if (vi->format->subSamplingW == 1 && vi->format->subSamplingH == 1)
      linkPlanar<420>(mask);
    else if (vi->format->subSamplingW == 1 && vi->format->subSamplingH == 0)
      linkPlanar<422>(mask);
    else if (vi->format->subSamplingW == 0 && vi->format->subSamplingH == 0)
      linkPlanar<444>(mask);
    else if (vi->format->subSamplingW == 2 && vi->format->subSamplingH == 0)
      linkPlanar<411>(mask);
}

void TFMPP::buildMotionMask1_SSE2(const uint8_t *srcp1, const uint8_t *srcp2,
  uint8_t *dstp, int s1_pitch, int s2_pitch, int dst_pitch, int width,
  int height, const CPUFeatures *cpu) const
{
    (void)cpu;

  memset(dstp - dst_pitch, 0xFF, dst_pitch);
  memset(dstp + dst_pitch*height, 0xFF, dst_pitch);
  __m128i thresh = _mm_set1_epi8((char)(std::max(std::min(255 - mthresh - 1, 255), 0)));
  __m128i full_ff = _mm_set1_epi8(-1);
  while (height--) {
    for (int x = 0; x < width; x += 16) {
      auto next1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 + s1_pitch + x));
      auto next2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 + s2_pitch + x));
      auto diff_next12 = _mm_subs_epu8(next1, next2);
      auto diff_next21 = _mm_subs_epu8(next2, next1);
      auto abs_diff_next = _mm_or_si128(diff_next12, diff_next21); // xmm0

      auto curr1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 + x));
      auto curr2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 + x));
      auto diff_curr12 = _mm_subs_epu8(curr1, curr2);
      auto diff_curr21 = _mm_subs_epu8(curr2, curr1);
      auto abs_diff_curr = _mm_or_si128(diff_curr12, diff_curr21); // xmm2

      auto prev1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 - s1_pitch + x));
      auto prev2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 - s2_pitch + x));
      auto diff_prev12 = _mm_subs_epu8(prev1, prev2);
      auto diff_prev21 = _mm_subs_epu8(prev2, prev1);
      auto abs_diff_prev = _mm_or_si128(diff_prev12, diff_prev21); // xmm1

      auto cmp_prev = _mm_cmpeq_epi8(_mm_adds_epu8(abs_diff_prev, thresh), full_ff);
      auto cmp_curr = _mm_cmpeq_epi8(_mm_adds_epu8(abs_diff_curr, thresh), full_ff);
      auto cmp_next = _mm_cmpeq_epi8(_mm_adds_epu8(abs_diff_next, thresh), full_ff);
      auto cmp = _mm_or_si128(_mm_or_si128(cmp_prev, cmp_curr), cmp_next);
      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), cmp);
    }
    srcp1 += s1_pitch;
    srcp2 += s2_pitch;
    dstp += dst_pitch;
  }
}


void TFMPP::buildMotionMask2_SSE2(const uint8_t *srcp1, const uint8_t *srcp2,
  const uint8_t *srcp3, uint8_t *dstp, int s1_pitch, int s2_pitch,
  int s3_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpu) const
{
    (void)cpu;

  __m128i thresh = _mm_set1_epi8((char)(std::max(std::min(255 - mthresh - 1, 255), 0)));
  __m128i all_ff = _mm_set1_epi8(-1);
  __m128i onesByte = _mm_set1_epi8(0x01);
  __m128i twosByte = _mm_set1_epi8(0x02);
  __m128i foursByte = _mm_set1_epi8(0x04);
  __m128i eightsByte = _mm_set1_epi8(0x08);
  __m128i sixteensByte = _mm_set1_epi8(0x10);
  __m128i thirtytwosByte = _mm_set1_epi8(0x20);
  memset(dstp - dst_pitch, 0xFF, dst_pitch);
  memset(dstp + dst_pitch*height, 0xFF, dst_pitch);
  while (height--) {
    for (int x = 0; x < width; x += 16) {
      auto next1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 + s1_pitch + x)); // prv?
      auto next2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 + s2_pitch + x)); // src?
      auto next3 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp3 + s3_pitch + x)); // nxt?

      auto absdiff12 = _mm_or_si128(_mm_subs_epu8(next1, next2), _mm_subs_epu8(next2, next1));
      auto absdiff23 = _mm_or_si128(_mm_subs_epu8(next2, next3), _mm_subs_epu8(next3, next2));
      auto cmp12 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff12, thresh), all_ff);
      auto cmp23 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff23, thresh), all_ff);
      auto masked_by_01_02 = _mm_or_si128(_mm_and_si128(cmp12, onesByte), _mm_and_si128(cmp23, twosByte));

      auto curr1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 + x)); // prv?
      auto curr2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 + x)); // src?
      auto curr3 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp3 + x)); // nxt?

      absdiff12 = _mm_or_si128(_mm_subs_epu8(curr1, curr2), _mm_subs_epu8(curr2, curr1));
      absdiff23 = _mm_or_si128(_mm_subs_epu8(curr2, curr3), _mm_subs_epu8(curr3, curr2));
      cmp12 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff12, thresh), all_ff);
      cmp23 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff23, thresh), all_ff);
      auto masked_by_04_08 = _mm_or_si128(_mm_and_si128(cmp12, foursByte), _mm_and_si128(cmp23, eightsByte));
      
      auto masked_by_01_02_04_08 = _mm_or_si128(masked_by_01_02, masked_by_04_08);

      auto prev1 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp1 - s1_pitch + x)); // prv?
      auto prev2 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp2 - s2_pitch + x)); // src?
      auto prev3 = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp3 - s3_pitch + x)); // nxt?

      absdiff12 = _mm_or_si128(_mm_subs_epu8(prev1, prev2), _mm_subs_epu8(prev2, prev1));
      absdiff23 = _mm_or_si128(_mm_subs_epu8(prev2, prev3), _mm_subs_epu8(prev3, prev2));
      cmp12 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff12, thresh), all_ff);
      cmp23 = _mm_cmpeq_epi8(_mm_adds_epu8(absdiff23, thresh), all_ff);
      auto masked_by_10_20 = _mm_or_si128(_mm_and_si128(cmp12, sixteensByte), _mm_and_si128(cmp23, thirtytwosByte));

      auto masked_by_01_02_04_08_10_20 = _mm_or_si128(masked_by_01_02_04_08, masked_by_10_20);

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), masked_by_01_02_04_08_10_20);

    }
    srcp1 += s1_pitch;
    srcp2 += s2_pitch;
    srcp3 += s3_pitch;
    dstp += dst_pitch;
  }
}

// not the same as in TDeint. Here 0xFF instead of 0x3C
//void TFMPP::denoiseYUY2(const VSFrameRef *mask)
//{
//  uint8_t *maskw = mask->GetPtr();
//  const int mask_pitch = mask->GetPitch();
//  const int Height = mask->GetHeight();
//  const int Width = mask->GetWidth();
//  uint8_t *maskwp = maskw - mask_pitch;
//  uint8_t *maskwn = maskw + mask_pitch;
//  for (int y = 1; y < Height - 1; ++y)
//  {
//    maskwp += mask_pitch;
//    maskw += mask_pitch;
//    maskwn += mask_pitch;
//    for (int x = 4; x < Width - 4; ++x)
//    {
//      if (maskw[x] == 0xFF)
//      {
//        if (maskwp[x - 2] == 0xFF) goto check_chroma;
//        if (maskwp[x] == 0xFF) goto check_chroma;
//        if (maskwp[x + 2] == 0xFF) goto check_chroma;
//        if (maskw[x - 2] == 0xFF) goto check_chroma;
//        if (maskw[x + 2] == 0xFF) goto check_chroma;
//        if (maskwn[x - 2] == 0xFF) goto check_chroma;
//        if (maskwn[x] == 0xFF) goto check_chroma;
//        if (maskwn[x + 2] == 0xFF) goto check_chroma;
//        maskw[x] = 0;
//      }
//    check_chroma:
//      ++x;
//      if (maskw[x] == 0xFF)
//      {
//        if (maskwp[x - 4] == 0xFF) continue;
//        if (maskwp[x] == 0xFF) continue;
//        if (maskwp[x + 4] == 0xFF) continue;
//        if (maskw[x - 4] == 0xFF) continue;
//        if (maskw[x + 4] == 0xFF) continue;
//        if (maskwn[x - 4] == 0xFF) continue;
//        if (maskwn[x] == 0xFF) continue;
//        if (maskwn[x + 4] == 0xFF) continue;
//        maskw[x] = 0;
//      }
//    }
//  }
//}

//void TFMPP::linkYUY2(const VSFrameRef *mask)
//{
//  uint8_t *maskw = mask->GetPtr();
//  const int mask_pitch = mask->GetPitch();
//  const int Height = mask->GetHeight();
//  const int Width = mask->GetWidth() >> 2;
//  for (int y = 1; y < Height - 1; ++y)
//  {
//    maskw += mask_pitch;
//    for (int x = 0; x < Width; ++x)
//    {
//      if ((((unsigned int*)maskw)[x] & 0x00FF00FF) == 0x00FF00FF)
//      {
//        ((unsigned int*)maskw)[x] = 0xFFFFFFFF;
//      }
//    }
//  }
//}

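// Operates on the mask only, so no high-bit-depth (HBD) variant is needed here.
// Differences from the TDeinterlace version:
// TFMPP::denoisePlanar takes a VSFrameRef and uses the marker value 0xFF; TDeinterlace takes a PVideoFrame and uses 0x3C.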
// Removes isolated marks from every plane of the mask, in place.
// An interior pixel equal to 0xFF is reset to 0 when none of its eight
// neighbours is 0xFF. The outermost rows and columns are left untouched.
// Pixels are visited top-to-bottom, left-to-right, so earlier clears are
// visible to later neighbourhood checks.
void TFMPP::denoisePlanar(VSFrameRef *mask) const
{
  const int planeCount = vsapi->getFrameFormat(mask)->numPlanes;
  for (int plane = 0; plane < planeCount; ++plane)
  {
    uint8_t *above = vsapi->getWritePtr(mask, plane);
    const int pitch = vsapi->getStride(mask, plane);
    const int rows = vsapi->getFrameHeight(mask, plane);
    const int cols = vsapi->getFrameWidth(mask, plane);

    for (int y = 1; y < rows - 1; ++y, above += pitch)
    {
      uint8_t *cur = above + pitch;
      const uint8_t *below = cur + pitch;

      for (int x = 1; x < cols - 1; ++x)
      {
        if (cur[x] != 0xFF)
          continue;

        const bool hasMarkedNeighbour =
          above[x - 1] == 0xFF || above[x] == 0xFF || above[x + 1] == 0xFF ||
          cur[x - 1] == 0xFF || cur[x + 1] == 0xFF ||
          below[x - 1] == 0xFF || below[x] == 0xFF || below[x + 1] == 0xFF;

        if (!hasMarkedNeighbour)
          cur[x] = 0;
      }
    }
  }
}

// Propagates luma marks into both chroma planes of the mask.
//
// For every chroma row except the first and last, a chroma pixel in planes 1
// and 2 is set to 0xFF when all luma mask bytes it is checked against are 0xFF:
//   420: the horizontal luma pair in luma rows 2y and 2y+1, plus the pair in
//        row 2y-1 for odd y or row 2y+2 for even y
//   422: the horizontal luma pair in row y
//   444: the single luma byte in row y
//   411: the four horizontal luma bytes in row y
// Chroma pixels are never cleared here.
template<int planarType>
void TFMPP::linkPlanar(VSFrameRef* mask) const
{
  uint8_t* const lumaBase = vsapi->getWritePtr(mask, 0);
  uint8_t* const chromaBase1 = vsapi->getWritePtr(mask, 1);
  uint8_t* const chromaBase2 = vsapi->getWritePtr(mask, 2);
  const int lumaPitch = vsapi->getStride(mask, 0);
  const int chromaPitch = vsapi->getStride(mask, 2);
  const int chromaRows = vsapi->getFrameHeight(mask, 2);
  const int chromaCols = vsapi->getFrameWidth(mask, 2);

  // True when the `count` bytes starting at p are all 0xFF.
  const auto allMarked = [](const uint8_t* p, int count) {
    for (int i = 0; i < count; ++i)
    {
      if (p[i] != 0xFF)
        return false;
    }
    return true;
  };

  if constexpr (planarType == 420)
  {
    for (int y = 1; y < chromaRows - 1; ++y)
    {
      // Chroma row y covers luma rows 2y and 2y+1 (YV12 vertical subsampling).
      const uint8_t* const lumaPrev = lumaBase + (2 * y - 1) * lumaPitch;
      const uint8_t* const lumaCur = lumaPrev + lumaPitch;
      const uint8_t* const lumaNext = lumaCur + lumaPitch;
      const uint8_t* const lumaNextNext = lumaNext + lumaPitch;
      uint8_t* const chromaRow1 = chromaBase1 + y * chromaPitch;
      uint8_t* const chromaRow2 = chromaBase2 + y * chromaPitch;

      // Odd chroma rows additionally check the luma row above,
      // even chroma rows the luma row two below.
      const uint8_t* const lumaExtra = (y & 1) ? lumaPrev : lumaNextNext;

      for (int x = 0; x < chromaCols; ++x)
      {
        const int lx = 2 * x;
        if (allMarked(lumaCur + lx, 2) &&
          allMarked(lumaNext + lx, 2) &&
          allMarked(lumaExtra + lx, 2))
        {
          chromaRow1[x] = chromaRow2[x] = 0xFF;
        }
      }
    }
  }
  else { // 422 444 411
    // Number of luma bytes per chroma pixel in a row; 0 for an unhandled type.
    constexpr int span = (planarType == 422) ? 2
                       : (planarType == 444) ? 1
                       : (planarType == 411) ? 4
                       : 0;

    for (int y = 1; y < chromaRows - 1; ++y)
    {
      const uint8_t* const lumaRow = lumaBase + y * lumaPitch;
      uint8_t* const chromaRow1 = chromaBase1 + y * chromaPitch;
      uint8_t* const chromaRow2 = chromaBase2 + y * chromaPitch;

      for (int x = 0; x < chromaCols; ++x)
      {
        if constexpr (span > 0) {
          if (allMarked(lumaRow + span * x, span)) // horizontal subsampling
          {
            chromaRow1[x] = chromaRow2[x] = 0xFF;
          }
        }
      }
    }
  }
}

// Selects the BlendDeint_core instantiation for the clip's bit depth:
// uint8_t for 8-bit samples, uint16_t for everything else.
void TFMPP::BlendDeint(const VSFrameRef *src, const VSFrameRef* mask, VSFrameRef *dst,
  bool nomask) const
{
  const bool highBitDepth = vi->format->bitsPerSample != 8;
  if (highBitDepth)
  {
    BlendDeint_core<uint16_t>(src, mask, dst, nomask);
    return;
  }
  BlendDeint_core<uint8_t>(src, mask, dst, nomask);
}

template<typename pixel_t>
void TFMPP::BlendDeint_core(const VSFrameRef *src, const VSFrameRef* mask, VSFrameRef *dst,
  bool nomask) const
{
  bool use_sse2 = cpuFlags.sse2;

  const int np = vi->format->numPlanes;

  for (int b = 0; b < np; ++b)
  {
    const int plane = b;
    const pixel_t *srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t);

    const int width = vsapi->getFrameWidth(src, plane);
    const int height = vsapi->getFrameHeight(src, plane);

    const pixel_t* srcpp = srcp - src_pitch;
    const pixel_t* srcpn = srcp + src_pitch;

    pixel_t *dstp = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, plane));
    const int dst_pitch = vsapi->getStride(dst, plane) / sizeof(pixel_t);

    const uint8_t *maskp = vsapi->getReadPtr(mask, b);
    const int msk_pitch = vsapi->getStride(mask, b);
    
    // top line
    for (int x = 0; x < width; ++x)
      dstp[x] = (srcp[x] + srcpn[x] + 1) >> 1;
    srcpp += src_pitch;
    srcp += src_pitch;
    srcpn += src_pitch;
    dstp += dst_pitch;
    maskp += msk_pitch;
    const int lines_to_process = height - 2;
    if (nomask)
    {
      // fixme: hbd SIMD
      if (sizeof(pixel_t) == 1 && use_sse2)
        blendDeintMask_SSE2<false>((const uint8_t *)srcp, (uint8_t*)dstp, nullptr, src_pitch, dst_pitch, 0, width, lines_to_process);
      else
        blendDeintMask_C<pixel_t, false>(srcp, dstp, nullptr, src_pitch, dst_pitch, 0, width, lines_to_process);
    }
    else
    {
      // with mask
      if (sizeof(pixel_t) == 1 && use_sse2)
        blendDeintMask_SSE2<true>((const uint8_t*)srcp, (uint8_t*)dstp, maskp, src_pitch, dst_pitch, msk_pitch, width, lines_to_process);
      else
        blendDeintMask_C<pixel_t, true>(srcp, dstp, maskp, src_pitch, dst_pitch, msk_pitch, width, lines_to_process);
    }
    srcpp += src_pitch * lines_to_process;
    srcp += src_pitch * lines_to_process;
    // srcpn += src_pitch * lines_to_process; // no forther use
    // maskp += msk_pitch * lines_to_process; // no further use
    dstp += dst_pitch * lines_to_process;
    // bottom line
    for (int x = 0; x < width; ++x)
      dstp[x] = (srcpp[x] + srcp[x] + 1) >> 1;
  }
}


// Bitwise select built from SSE2 operations: each result bit is taken from
// `b` where the corresponding bit of `mask` is set and from `a` where it is
// clear, i.e. (mask & b) | (~mask & a).
static AVS_FORCEINLINE __m128i _MM_BLENDV_EPI8(__m128i const& a, __m128i const& b, __m128i const& mask) {
  //return  _mm_blendv_epi8 (a, b, mask);
  return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

// SSE2 vertical blend for 8-bit rows.
//
// For `height` rows starting at srcp/dstp, each output pixel is
// (above + 2*current + below + 2) >> 2. With with_mask the blended value is
// kept only where the mask bits are set; elsewhere the current source pixel
// is stored. Without a mask, maskp and msk_pitch are ignored.
// Processes 16 pixels per step using aligned loads and stores.
template<bool with_mask>
void blendDeintMask_SSE2(const uint8_t *srcp, uint8_t *dstp,
  const uint8_t *maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height)
{
  if constexpr (!with_mask) {
    (void)maskp;
    (void)msk_pitch;
  }

  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi16(2);

  // (p + c*2 + n + 2) >> 2 on eight 16-bit lanes
  const auto blend3 = [&](const __m128i &p, const __m128i &c, const __m128i &n) {
    const __m128i sum = _mm_add_epi16(p, _mm_add_epi16(_mm_slli_epi16(c, 1), n));
    return _mm_srli_epi16(_mm_add_epi16(sum, rounding), 2);
  };

  for (int rows = height; rows != 0; --rows) {
    for (int x = 0; x < width; x += 16) {
      const __m128i above = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp - src_pitch + x));
      const __m128i here = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));
      const __m128i below = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + src_pitch + x));

      // Widen to 16 bits so the weighted sum cannot overflow.
      const __m128i lo = blend3(_mm_unpacklo_epi8(above, zero),
                                _mm_unpacklo_epi8(here, zero),
                                _mm_unpacklo_epi8(below, zero));
      const __m128i hi = blend3(_mm_unpackhi_epi8(above, zero),
                                _mm_unpackhi_epi8(here, zero),
                                _mm_unpackhi_epi8(below, zero));
      __m128i out = _mm_packus_epi16(lo, hi);

      if constexpr (with_mask) {
        const __m128i select = _mm_load_si128(reinterpret_cast<const __m128i*>(maskp + x));
        out = _MM_BLENDV_EPI8(here, out, select); // if mask then blended else current
      }
      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), out);
    }
    srcp += src_pitch;
    dstp += dst_pitch;
    if constexpr (with_mask)
      maskp += msk_pitch;
  }
}

// Scalar vertical blend.
//
// For `height` rows starting at srcp/dstp, writes
// (above + 2*current + below + 2) >> 2 to dstp. With with_mask this is done
// only where the mask byte equals 0xFF; all other pixels are copied from the
// current source row. Without a mask, maskp is never dereferenced.
// src_pitch and dst_pitch are in pixels, msk_pitch in bytes.
template<typename pixel_t, bool with_mask>
void blendDeintMask_C(const pixel_t* srcp, pixel_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height)
{
  for (int rows = height; rows != 0; --rows)
  {
    const pixel_t* above = srcp - src_pitch;
    const pixel_t* below = srcp + src_pitch;

    for (int x = 0; x < width; ++x)
    {
      bool blend = true;
      if constexpr (with_mask)
        blend = (maskp[x] == 0xFF);

      if (blend)
        dstp[x] = (above[x] + (srcp[x] << 1) + below[x] + 2) >> 2;
      else
        dstp[x] = srcp[x];
    }

    srcp += src_pitch;
    dstp += dst_pitch;
    if constexpr (with_mask)
      maskp += msk_pitch;
  }
}

// Selects the CubicDeint_core instantiation for the clip's bit depth.
// Handles 8, 10, 12, 14 and 16 bits per sample; any other depth does nothing.
void TFMPP::CubicDeint(const VSFrameRef *src, const VSFrameRef *mask, VSFrameRef *dst, bool nomask,
  int field) const
{
    const int bits = vi->format->bitsPerSample;

    if (bits == 8)
        CubicDeint_core<uint8_t, 8>(src, mask, dst, nomask, field);
    else if (bits == 10)
        CubicDeint_core<uint16_t, 10>(src, mask, dst, nomask, field);
    else if (bits == 12)
        CubicDeint_core<uint16_t, 12>(src, mask, dst, nomask, field);
    else if (bits == 14)
        CubicDeint_core<uint16_t, 14>(src, mask, dst, nomask, field);
    else if (bits == 16)
        CubicDeint_core<uint16_t, 16>(src, mask, dst, nomask, field);
}

// Cubic deinterlacer: fills every second row of `dst` from the rows of `src`
// above and below it, for every plane.
//
//   src     source frame
//   mask    motion mask (read as uint8_t); only consulted when nomask is false
//   dst     destination frame; only the rows described below are written here
//   nomask  true: interpolate every pixel of the written rows;
//           false: interpolate only where the mask byte is 0xFF and copy the
//           source pixel of the same row otherwise
//   field   0: rows 0, 2, 4, ... of dst are written
//           1: rows 1, 3, 5, ... of dst are written
//
// All pitches are doubled so that one pointer step moves two frame rows.
// The first and last interpolated rows use a two-tap rounded average; the rows
// in between are delegated to cubicDeintMask_SSE2 (8-bit with SSE2 available)
// or cubicDeintMask_C. The outermost written row that has a source neighbour
// on one side only (the "orphan") is copied from that neighbour.
template<typename pixel_t, int bits_per_pixel>
void TFMPP::CubicDeint_core(const VSFrameRef *src, const VSFrameRef* mask, VSFrameRef *dst, bool nomask,
  int field) const
{
  bool use_sse2 = cpuFlags.sse2;

  const int np = vi->format->numPlanes;

  for (int b = 0; b < np; ++b)
  {
    const int plane = b;

    const pixel_t *srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
    // !! yes, double;
    // src_pitch, dst_pitch and msk_pitch all span two rows; src/dst in pixels, mask in bytes.
    const int src_pitch = vsapi->getStride(src, plane) * 2 / sizeof(pixel_t);

    const int width = vsapi->getFrameWidth(src, plane);
    const int rowsize = width * sizeof(pixel_t);
    const int height = vsapi->getFrameHeight(src, plane);

    pixel_t *dstp = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, plane));
    const int dst_pitch = (vsapi->getStride(dst, plane) << 1) / sizeof(pixel_t);

    const uint8_t *maskp = vsapi->getReadPtr(mask, b);
    const int msk_pitch = vsapi->getStride(mask, b) << 1;
    
    // (pitch >> 1) is a single row. dstp/maskp start at the first row to be
    // interpolated (row 2 - field); srcp starts one row below that (row 3 - field).
    srcp += (src_pitch >> 1)*(3 - field);
    dstp += (dst_pitch >> 1)*(2 - field);
    maskp += (msk_pitch >> 1)*(2 - field);

    // Relative to the row being written: srcpp is one row above, srcppp three
    // rows above, srcpn three rows below, srcr the source row at the same position.
    const pixel_t *srcpp = srcp - src_pitch;
    const pixel_t *srcppp = srcpp - src_pitch;
    const pixel_t* srcpn = srcp + src_pitch;
    const pixel_t*srcr = srcp - (src_pitch >> 1);

    // top orphan
    // field 0: dst row 0 has no source row above it, so it is copied from src row 1.
    if (field == 0)
      vs_bitblt(vsapi->getWritePtr(dst, plane), (dst_pitch >> 1) * sizeof(pixel_t),
        vsapi->getReadPtr(src, plane) + (src_pitch >> 1) * sizeof(pixel_t), (src_pitch >> 1) * sizeof(pixel_t), rowsize, 1);
    
    if (nomask)
    {
      // top
      // Two-tap average of the rows directly above and below.
      for (int x = 0; x < width; ++x)
        dstp[x] = (srcp[x] + srcpp[x] + 1) >> 1;
      srcppp += src_pitch;
      srcpp += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      dstp += dst_pitch;
      // middle
      // height / 2 rows are written per plane; the orphan, top and bottom rows
      // are handled separately, leaving height / 2 - 3 for the cubic helper.
      const int lines_to_process = height / 2 - 3;
      if (bits_per_pixel == 8 && use_sse2)
      {
        // false: no mask
        cubicDeintMask_SSE2<false>((const uint8_t *)srcp, (uint8_t*)dstp, nullptr, src_pitch, dst_pitch, 0, width, lines_to_process);
      }
      else
      {
        cubicDeintMask_C<pixel_t, bits_per_pixel, false>(srcp, dstp, nullptr, src_pitch, dst_pitch, 0, width, lines_to_process);
      }
      // The helpers receive the pointers by value, so skip the processed rows here.
      srcppp += src_pitch * lines_to_process;
      srcpp += src_pitch * lines_to_process;
      srcp += src_pitch * lines_to_process;
      srcpn += src_pitch * lines_to_process;
      dstp += dst_pitch * lines_to_process;
      // bottom
      // Two-tap average again for the last interpolated row.
      for (int x = 0; x < width; ++x)
        dstp[x] = (srcp[x] + srcpp[x] + 1) >> 1;
    }
    else
    {
      // with mask
      // top
      // Interpolate only where the mask is 0xFF; otherwise copy the source pixel.
      for (int x = 0; x < width; ++x)
      {
        if (maskp[x] == 0xFF)
          dstp[x] = (srcp[x] + srcpp[x] + 1) >> 1;
        else dstp[x] = srcr[x];
      }
      srcppp += src_pitch;
      srcpp += src_pitch;
      srcr += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      maskp += msk_pitch;
      dstp += dst_pitch;
      // middle
      const int lines_to_process = height / 2 - 3;
      if (bits_per_pixel == 8 && use_sse2)
      {
        // fixme: hbd SIMD sse2 for 10+ bits
        // true: with_mask
        cubicDeintMask_SSE2<true>((const uint8_t*)srcp, (uint8_t*)dstp, maskp, src_pitch, dst_pitch, msk_pitch, width, lines_to_process);
      }
      else
      {
        //for (int y = 4 - field; y < height - 3; y += 2)
        cubicDeintMask_C<pixel_t, bits_per_pixel, true>(srcp, dstp, maskp, src_pitch, dst_pitch, msk_pitch, width, lines_to_process);
      }
      // The helpers receive the pointers by value, so skip the processed rows here.
      srcppp += src_pitch * lines_to_process;
      srcpp += src_pitch * lines_to_process;
      srcr += src_pitch * lines_to_process;
      srcp += src_pitch * lines_to_process;
      srcpn += src_pitch * lines_to_process;
      maskp += msk_pitch * lines_to_process;
      dstp += dst_pitch * lines_to_process;
      // bottom
      for (int x = 0; x < width; ++x)
      {
        if (maskp[x] == 0xFF)
          dstp[x] = (srcp[x] + srcpp[x] + 1) >> 1;
        else
          dstp[x] = srcr[x];
      }
    }
    // bottom orphan
    // field 1: dst row height-1 has no source row below it, so it is copied from src row height-2.
    if (field == 1)
      vs_bitblt(vsapi->getWritePtr(dst, plane) + (height - 1)*(dst_pitch >> 1) * sizeof(pixel_t), (dst_pitch >> 1) * sizeof(pixel_t),
        vsapi->getReadPtr(src, plane) + (height - 2)*(src_pitch >> 1) * sizeof(pixel_t), (src_pitch >> 1) * sizeof(pixel_t), rowsize, 1);
  }
}


// SSE2 cubic (4-tap) vertical interpolation of 8-bit rows, 16 pixels per step.
//
// Per pixel this evaluates
//   (19 * (prev + curr) - 3 * (prevprev + next) + 16) >> 5
// where prevprev/prev/curr/next are the rows at srcp - 2*src_pitch,
// srcp - src_pitch, srcp and srcp + src_pitch. The subtraction saturates at 0
// and the final pack saturates at 255, which gives the [0, 255] clamp.
//
// with_mask == true : the result is blended through _MM_BLENDV_EPI8 with the
//                     row at srcp - (src_pitch >> 1), selected by maskp.
// with_mask == false: maskp and msk_pitch are not used.
//
// src_pitch is expected to arrive already doubled by the caller. Loads and
// stores are the aligned variants and whole 16-byte groups are processed, so
// a width that is not a multiple of 16 touches bytes up to the next multiple.
template<bool with_mask>
void cubicDeintMask_SSE2(const uint8_t *srcp, uint8_t *dstp,
  const uint8_t *maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height)
{
  // half of the (doubled) pitch: distance to the adjacent frame line
  const int single_pitch = src_pitch >> 1;

  const auto zero = _mm_setzero_si128();
  const auto k3 = _mm_set1_epi16(3);
  const auto k16 = _mm_set1_epi16(16);
  const auto k19 = _mm_set1_epi16(19);

  // Filters eight 16-bit lanes: (19*(p + c) -sat- 3*(pp + n) + 16) >> 5
  const auto filter8 = [&](__m128i pp, __m128i p, __m128i c, __m128i n) {
    const auto outer_x3 = _mm_mullo_epi16(_mm_add_epi16(pp, n), k3);
    const auto inner_x19 = _mm_mullo_epi16(_mm_add_epi16(p, c), k19);
    const auto diff = _mm_subs_epu16(inner_x19, outer_x3); // unsigned saturation: negative -> 0
    return _mm_srli_epi16(_mm_add_epi16(diff, k16), 5);
  };

  for (int rows_left = height; rows_left != 0; --rows_left) {
    for (int x = 0; x < width; x += 16) {
      const auto row_pp = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp - src_pitch * 2 + x));
      const auto row_p = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp - src_pitch + x));
      const auto row_c = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));
      const auto row_n = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + src_pitch + x));

      // widen bytes to words, filter the low and the high eight pixels separately
      const auto lo = filter8(
        _mm_unpacklo_epi8(row_pp, zero), _mm_unpacklo_epi8(row_p, zero),
        _mm_unpacklo_epi8(row_c, zero), _mm_unpacklo_epi8(row_n, zero));
      const auto hi = filter8(
        _mm_unpackhi_epi8(row_pp, zero), _mm_unpackhi_epi8(row_p, zero),
        _mm_unpackhi_epi8(row_c, zero), _mm_unpackhi_epi8(row_n, zero));

      // back to bytes; the pack saturates values above 255
      auto out = _mm_packus_epi16(lo, hi);

      if constexpr (with_mask) {
        const auto select = _mm_load_si128(reinterpret_cast<const __m128i*>(maskp + x));
        // row half a (doubled) pitch above srcp: used where the mask does not select
        const auto keep = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp - single_pitch + x));
        out = _MM_BLENDV_EPI8(keep, out, select);
      }
      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), out);
    }
    srcp += src_pitch;
    dstp += dst_pitch;
    if constexpr (with_mask)
      maskp += msk_pitch;
  }

  if constexpr (!with_mask) {
    // silence unused-parameter warnings for the mask-less instantiation
    (void)maskp;
    (void)msk_pitch;
  }
}

// Scalar cubic vertical interpolation over `height` rows of `width` pixels.
//
// For each output pixel the four rows at srcp - 2*src_pitch, srcp - src_pitch,
// srcp and srcp + src_pitch are fed to cubicInt<bits_per_pixel>.
// src_pitch and dst_pitch are in pixel_t units, msk_pitch in mask bytes.
// src_pitch is expected to arrive doubled; half of it addresses the row that
// is copied through when the mask does not select a pixel.
//
// with_mask == true : only pixels whose mask byte is 0xFF are interpolated,
//                     the others are copied from srcp - (src_pitch >> 1).
// with_mask == false: every pixel is interpolated and maskp is never read.
template<typename pixel_t, int bits_per_pixel, bool with_mask>
void cubicDeintMask_C(const pixel_t* srcp, pixel_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height)
{
  for (int rows_left = height; rows_left != 0; --rows_left) {
    const pixel_t* const row_m2 = srcp - src_pitch * 2;
    const pixel_t* const row_m1 = srcp - src_pitch;
    const pixel_t* const row_p1 = srcp + src_pitch;
    const pixel_t* const row_keep = srcp - (src_pitch >> 1); // pitch came doubled

    for (int col = 0; col < width; ++col)
    {
      bool interpolate = true;
      if (with_mask)
        interpolate = (maskp[col] == 0xFF);

      if (interpolate)
      {
        const int value = cubicInt<bits_per_pixel>(row_m2[col], row_m1[col], srcp[col], row_p1[col]);
        dstp[col] = value;
      }
      else
      {
        dstp[col] = row_keep[col];
      }
    }

    srcp += src_pitch;
    dstp += dst_pitch;
    if constexpr (with_mask)
      maskp += msk_pitch;
  }
}


//void TFMPP::destroyHint(VSFrameRef *dst, unsigned int hint)
//{
//  if (vi->format->bytesPerSample == 1)
//    destroyHint_core<uint8_t>(dst, hint);
//  else
//    destroyHint_core<uint16_t>(dst, hint);
//}

//template<typename pixel_t>
//void TFMPP::destroyHint_core(VSFrameRef *dst, unsigned int hint)
//{
//  pixel_t* p = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, 0));
//  if (hint & 0x80)
//  {
//    hint >>= 8;
//    for (int i = 0; i < 32; ++i)
//    {
//      *p &= ~1;
//      *p++ |= ((MAGIC_NUMBER_2 & (1 << i)) >> i);
//    }
//    for (int i = 0; i < 32; ++i)
//    {
//      *p &= ~1;
//      *p++ |= ((hint & (1 << i)) >> i);
//    }
//  }
//  else
//  {
//    for (int i = 0; i < 64; ++i)
//      *p++ &= ~1;
//  }
//}

//void TFMPP::putHint(VSFrameRef *dst, int field, unsigned int hint)
//{
//  if (vi->format->bytesPerSample == 1)
//    return putHint_core<uint8_t>(dst, field, hint);
//  else
//    return putHint_core<uint16_t>(dst, field, hint);
//}

//template<typename pixel_t>
//void TFMPP::putHint_core(VSFrameRef *dst, int field, unsigned int hint)
//{
//  pixel_t *p = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, 0));
//  unsigned int i;
//  hint &= (D2VFILM | 0xFF80);
//  if (field == 1)
//  {
//    hint |= TOP_FIELD;
//    hint |= ISDT;
//  }
//  else hint |= ISDB;
//  for (i = 0; i < 32; ++i)
//  {
//    *p &= ~1;
//    *p++ |= ((MAGIC_NUMBER & (1 << i)) >> i);
//  }
//  for (i = 0; i < 32; ++i)
//  {
//    *p &= ~1;
//    *p++ |= ((hint & (1 << i)) >> i);
//  }
//}

void TFMPP::getProperties(const VSFrameRef *src, int& field, bool& combed) const
{
    field = -1; combed = false;

    const VSMap *props = vsapi->getFramePropsRO(src);

    if (vsapi->propNumElements(props, PROP_TFMField) == 1)
        field = int64ToIntS(vsapi->propGetInt(props, PROP_TFMField, 0, nullptr));

    if (vsapi->propNumElements(props, PROP_Combed) == 1)
        combed = !!vsapi->propGetInt(props, PROP_Combed, 0, nullptr);
}

//template<typename pixel_t>
//bool TFMPP::getHint_core(const VSFrameRef *src, int &field, bool &combed, unsigned int &hint)
//{
//  field = -1; combed = false; hint = 0;
//  const pixel_t *srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, 0));
//  unsigned int i, magic_number = 0;
//  for (i = 0; i < 32; ++i)
//  {
//    magic_number |= ((*srcp++ & 1) << i);
//  }
//  if (magic_number != MAGIC_NUMBER) return false;
//  for (i = 0; i < 32; ++i)
//  {
//    hint |= ((*srcp++ & 1) << i);
//  }
//  if (hint & 0xFFFF0000) return false;
//  if (hint&TOP_FIELD) field = 1;
//  else field = 0;
//  if (hint&COMBED) combed = true;
//  int value = hint & 0x07;
//  if (value == 5) { combed = true; field = 0; }
//  else if (value == 6) { combed = true; field = 1; }
//  return true;
//}

// Applies the per-frame overrides stored in setArray to PP and mthresh for
// frame `n`.
//
// setArray is walked in groups of four values: { code, first, last, value }.
// For every group whose inclusive range [first, last] contains n, code 80
// ('P') replaces PP and code 77 ('M') replaces mthresh. Both settings are
// first reset to their saved originals; later groups win over earlier ones.
// Nothing is touched when setArray is empty.
// NOTE(review): assumes setArray.size() is a multiple of 4 - confirm where it is filled.
void TFMPP::getSetOvr(int n)
{
  if (setArray.size() == 0) return;

  constexpr int kCodePP = 80;      // 'P'
  constexpr int kCodeMthresh = 77; // 'M'

  mthresh = mthresh_origSaved;
  PP = PP_origSaved;

  const int total = (int)setArray.size();
  for (int base = 0; base < total; base += 4)
  {
    const bool frameInRange = (n >= setArray[base + 1]) && (n <= setArray[base + 2]);
    if (!frameInRange)
      continue;

    if (setArray[base] == kCodePP)
      PP = setArray[base + 3];
    else if (setArray[base] == kCodeMthresh)
      mthresh = setArray[base + 3];
  }
}

// Copies every second line of `src` into `dst` for all planes, independent of
// bit depth (row length is width * bytesPerSample bytes).
//
// The lines copied in place are 1 - field, 3 - field, ... (height / 2 rows).
// One border line is additionally filled from its neighbour:
//   field == 0: dst line 0 receives src line 1
//   field == 1: dst line height - 1 receives src line height - 2
void TFMPP::copyField(VSFrameRef *dst, const VSFrameRef *src, int field) const
{
  const VSFormat *fmt = vsapi->getFrameFormat(src);
  const int planeCount = fmt->numPlanes;

  for (int plane = 0; plane < planeCount; ++plane)
  {
    const int dstStride = vsapi->getStride(dst, plane);
    const int srcStride = vsapi->getStride(src, plane);
    uint8_t *dstBase = vsapi->getWritePtr(dst, plane);
    const uint8_t *srcBase = vsapi->getReadPtr(src, plane);
    const int planeWidth = vsapi->getFrameWidth(src, plane);
    const int planeHeight = vsapi->getFrameHeight(src, plane);

    // first line of the parity that is copied unchanged
    const int firstLine = 1 - field;

    if (field == 0)
    {
      // top border line: duplicate src line 1
      vs_bitblt(dstBase, dstStride,
        srcBase + srcStride, srcStride,
        planeWidth * fmt->bytesPerSample, 1);
    }

    // every second line, using doubled strides
    vs_bitblt(dstBase + dstStride * firstLine, dstStride * 2,
      srcBase + srcStride * firstLine, srcStride * 2,
      planeWidth * fmt->bytesPerSample, planeHeight >> 1);

    if (field == 1)
    {
      // bottom border line: duplicate src line height - 2
      vs_bitblt(dstBase + dstStride * (planeHeight - 1), dstStride,
        srcBase + srcStride * (planeHeight - 2), srcStride,
        planeWidth * fmt->bytesPerSample, 1);
    }
  }
}

// Builds the debug text (version banner, current field / PP / mthresh and the
// frame number) and stores it in the PROP_TFMDisplay property of `dst`,
// replacing any previous value.
void TFMPP::writeDisplay(VSFrameRef *dst, int n, int field) const
{
  // scratch buffer for one formatted fragment at a time
  char fragment[160];

  std::string text("TFMPP " VERSION " by tritical\n");

  snprintf(fragment, sizeof(fragment), "field = %d  PP = %d  mthresh = %d ", field, PP, mthresh);
  text.append(fragment);

  snprintf(fragment, sizeof(fragment), "frame: %d  (COMBED - DEINTERLACED)! ", n);
  text.append(fragment);

  VSMap *frameProps = vsapi->getFramePropsRW(dst);
  vsapi->propSetData(frameProps, PROP_TFMDisplay, text.c_str(), text.size(), paReplace);
}

// Dispatches to the elaDeintPlanar instantiation matching the clip's bit
// depth (8, 10, 12, 14 or 16 bits per sample). Any other bit depth is
// silently ignored and dst is left untouched.
void TFMPP::elaDeint(VSFrameRef *dst, const VSFrameRef* mask, const VSFrameRef *src, bool nomask, int field) const
{
    const int bits = vi->format->bitsPerSample;

    if (bits == 8)
        elaDeintPlanar<uint8_t, 8>(dst, mask, src, nomask, field);
    else if (bits == 10)
        elaDeintPlanar<uint16_t, 10>(dst, mask, src, nomask, field);
    else if (bits == 12)
        elaDeintPlanar<uint16_t, 12>(dst, mask, src, nomask, field);
    else if (bits == 14)
        elaDeintPlanar<uint16_t, 14>(dst, mask, src, nomask, field);
    else if (bits == 16)
        elaDeintPlanar<uint16_t, 16>(dst, mask, src, nomask, field);
}

// totally different from TDeinterlace ELADeintPlanar
//
// Edge-directed interpolation of the lines y = 2 - field, 4 - field, ... for
// samples of type pixel_t holding bits_per_pixel significant bits.
//   dst    - frame whose selected pixels on those lines are overwritten
//   mask   - mask frame read as bytes; a pixel is rebuilt only where its
//            mask byte equals 0xFF
//   src    - frame supplying the neighbouring lines (y-3, y-1, y+1, y+3)
//   nomask - when true the mask test is skipped and every pixel is rebuilt
//   field  - selects the first rebuilt line (2 - field)
// Pixels that are not selected by the mask are not written by this function.
// Plane 0 gets the directional interpolation; planes 1 and 2 only get the
// plain vertical average / cubicInt interpolation.
// NOTE(review): planes 1 and 2 are read unconditionally - presumably callers
// only pass three-plane formats; confirm.
template<typename pixel_t, int bits_per_pixel>
void TFMPP::elaDeintPlanar(VSFrameRef *dst, const VSFrameRef *mask, const VSFrameRef *src, bool nomask, int field) const
{
  // src/dst strides are converted from bytes to pixel_t units
  const pixel_t *srcpY = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, 0));
  const pixel_t *srcpV = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, 2));
  const pixel_t *srcpU = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, 1));
  int src_pitchY = vsapi->getStride(src, 0) / sizeof(pixel_t);
  int src_pitchUV = vsapi->getStride(src, 2) / sizeof(pixel_t);
  
  const int WidthY = vsapi->getFrameWidth(src, 0);
  const int WidthUV = vsapi->getFrameWidth(src, 2);
  const int HeightY = vsapi->getFrameHeight(src, 0);
  const int HeightUV = vsapi->getFrameHeight(src, 2);
  
  pixel_t *dstpY = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, 0));
  pixel_t *dstpV = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, 2));
  pixel_t *dstpU = reinterpret_cast<pixel_t*>(vsapi->getWritePtr(dst, 1));
  int dst_pitchY = vsapi->getStride(dst, 0) / sizeof(pixel_t);
  int dst_pitchUV = vsapi->getStride(dst, 2) / sizeof(pixel_t);

  // the mask is addressed as bytes, so its strides stay in bytes
  const uint8_t *maskpY = vsapi->getReadPtr(mask, 0);
  const uint8_t *maskpV = vsapi->getReadPtr(mask, 2);
  const uint8_t *maskpU = vsapi->getReadPtr(mask, 1);
  int mask_pitchY = vsapi->getStride(mask, 0);
  int mask_pitchUV = vsapi->getStride(mask, 2);

  // dst/mask start on the first rebuilt line (2 - field); the src pointer
  // starts one line lower, i.e. on the line directly below the rebuilt one
  srcpY += src_pitchY*(3 - field);
  srcpV += src_pitchUV*(3 - field);
  srcpU += src_pitchUV*(3 - field);
  dstpY += dst_pitchY*(2 - field);
  dstpV += dst_pitchUV*(2 - field);
  dstpU += dst_pitchUV*(2 - field);
  maskpY += mask_pitchY*(2 - field);
  maskpV += mask_pitchUV*(2 - field);
  maskpU += mask_pitchUV*(2 - field);
  // from here on every pitch step moves two frame lines
  src_pitchY <<= 1;
  src_pitchUV <<= 1;
  dst_pitchY <<= 1;
  dst_pitchUV <<= 1;
  mask_pitchY <<= 1;
  mask_pitchUV <<= 1;

  // relative to the rebuilt line y: srcp = y + 1, srcpp = y - 1,
  // srcppp = y - 3, srcpn = y + 3
  const pixel_t *srcppY = srcpY - src_pitchY;
  const pixel_t *srcpppY = srcppY - src_pitchY;
  const pixel_t *srcpnY = srcpY + src_pitchY;
  const pixel_t *srcppV = srcpV - src_pitchUV;
  const pixel_t *srcpppV = srcppV - src_pitchUV;
  const pixel_t *srcpnV = srcpV + src_pitchUV;
  const pixel_t *srcppU = srcpU - src_pitchUV;
  const pixel_t *srcpppU = srcppU - src_pitchUV;
  const pixel_t *srcpnU = srcpU + src_pitchUV;
  int stopx = WidthY;
  int startxuv = 0;
  int x, y;
  int stopxuv = WidthUV;
  int Iy1, Iy2, Iye;
  int Ix1, Ix2;
  int edgeS1, edgeS2;
  int sum, sumsq;
  int temp, temp1, temp2;
  int minN, maxN;
  double dir1, dir2, dir, dirF;

  // shift that scales bits_per_pixel-wide values down to the 8-bit range
  // (0 for 8-bit input); the fixed thresholds below are on that 8-bit scale
  constexpr int bitshift_to_8 = (bits_per_pixel - 8);

  auto square = [](int i)
  {
    return i * i;
  };

  // ---- plane 0: directional interpolation ----
  for (y = 2 - field; y < HeightY - 1; y += 2)
  {
    for (x = 0; x < stopx; ++x)
    {
      if (nomask || maskpY[x] == 0xFF)
      {
        // interior only: the directional path reads x - 4 .. x + 4 and all four rows
        if (y > 2 && y < HeightY - 3 && x>3 && x < WidthY - 4)
        {
          // 1-2-1 weighted gradients; Iy1/Ix1 use the upper row set (y-3 .. y+1),
          // Iy2/Ix2 the lower row set (y-1 .. y+3)
          // stay in safe 32 bit int by using 8 bit normalized data
          Iy1 = (-srcpY[x - 1] - srcpY[x] - srcpY[x] - srcpY[x + 1] + srcpppY[x - 1] + srcpppY[x] + srcpppY[x] + srcpppY[x + 1]) >> bitshift_to_8;
          Iy2 = (-srcpnY[x - 1] - srcpnY[x] - srcpnY[x] - srcpnY[x + 1] + srcppY[x - 1] + srcppY[x] + srcppY[x] + srcppY[x + 1]) >> bitshift_to_8;
          Ix1 = (srcpppY[x + 1] + srcppY[x + 1] + srcppY[x + 1] + srcpY[x + 1] - srcpppY[x - 1] - srcppY[x - 1] - srcppY[x - 1] - srcpY[x - 1]) >> bitshift_to_8;
          Ix2 = (srcppY[x + 1] + srcpY[x + 1] + srcpY[x + 1] + srcpnY[x + 1] - srcppY[x - 1] - srcpY[x - 1] - srcpY[x - 1] - srcpnY[x - 1]) >> bitshift_to_8;
          // squared gradient magnitudes
          edgeS1 = Ix1 * Ix1 + Iy1 * Iy1;
          edgeS2 = Ix2 * Ix2 + Iy2 * Iy2;
          // both edges weak: plain average of the lines above and below
          if (edgeS1 < 1600 && edgeS2 < 1600)
          {
            dstpY[x] = (srcppY[x] + srcpY[x] + 1) >> 1;
            continue;
          }
          constexpr int Const10 = 10 << bitshift_to_8;
          // above/below nearly equal and at least one edge weak: plain average
          if (abs(srcppY[x] - srcpY[x]) < Const10 && (edgeS1 < 1600 || edgeS2 < 1600))
          {
            dstpY[x] = (srcppY[x] + srcpY[x] + 1) >> 1;
            continue;
          }
          // spread test over the six nearest samples of the lines above and below:
          // 6*sumsq - sum^2 is 36x their variance for 8-bit input
          // stay in safe 32 bit int by using 8 bit normalized data
          sum = (srcppY[x - 1] + srcppY[x] + srcppY[x + 1] + srcpY[x - 1] + srcpY[x] + srcpY[x + 1]) >> bitshift_to_8;
          sumsq =
            square(srcppY[x - 1] >> bitshift_to_8) +
            square(srcppY[x] >> bitshift_to_8) +
            square(srcppY[x + 1] >> bitshift_to_8) +
            square(srcpY[x - 1] >> bitshift_to_8) +
            square(srcpY[x] >> bitshift_to_8) +
            square(srcpY[x + 1] >> bitshift_to_8);
          if (6 * sumsq - square(sum) < 432)
          {
            dstpY[x] = (srcppY[x] + srcpY[x] + 1) >> 1;
            continue;
          }
          // direction angle for each gradient pair, folded to stay below pi
          if (Ix1 == 0) dir1 = 3.1415926;
          else
          {
            dir1 = atan(Iy1 / (Ix1*2.0f)) + 1.5707963;
            if (Iy1 >= 0) { if (Ix1 < 0) dir1 += 3.1415927; }
            else { if (Ix1 >= 0) dir1 += 3.1415927; }
            if (dir1 >= 3.1415927) dir1 -= 3.1415927;
          }
          if (Ix2 == 0) dir2 = 3.1415926;
          else
          {
            dir2 = atan(Iy2 / (Ix2*2.0f)) + 1.5707963;
            if (Iy2 >= 0) { if (Ix2 < 0) dir2 += 3.1415927; }
            else { if (Ix2 >= 0) dir2 += 3.1415927; }
            if (dir2 >= 3.1415927) dir2 -= 3.1415927;
          }
          // choose the final direction: average the two when they agree and
          // both edges are strong, otherwise pick one of them
          if (fabs(dir1 - dir2) < 0.5)
          {
            if (edgeS1 >= 3600 && edgeS2 >= 3600) dir = (dir1 + dir2) * 0.5;
            else dir = edgeS1 >= edgeS2 ? dir1 : dir2;
          }
          else
          {
            if (edgeS1 >= 5000 && edgeS2 >= 5000)
            {
              // Iye: vertical gradient between the lines directly above and below;
              // it arbitrates between the two disagreeing directions
              // stay in safe 32 bit int by using 8 bit normalized data
              Iye = (-srcpY[x - 1] - srcpY[x] - srcpY[x] - srcpY[x + 1] + srcppY[x - 1] + srcppY[x] + srcppY[x] + srcppY[x + 1]) >> bitshift_to_8;
              if ((Iy1*Iye > 0) && (Iy2*Iye < 0)) dir = dir1;
              else if ((Iy1*Iye < 0) && (Iy2*Iye > 0)) dir = dir2;
              else
              {
                if (abs(Iye - Iy1) <= abs(Iye - Iy2)) dir = dir1;
                else dir = dir2;
              }
            }
            else dir = edgeS1 >= edgeS2 ? dir1 : dir2;
          }
          // dirF selects the horizontal offsets that are sampled: each 0.5 step
          // of |dirF| moves one pixel further out (up to 4); its sign selects
          // which side is taken from the line above vs. the line below.
          // temp is the interpolated value, temp1/temp2 are the contributions
          // of the line above / below and are only compared with each other.
          dirF = 0.5f / tan(dir);
          if (dirF >= 0.0f)
          {
            if (dirF >= 0.5f)
            {
              if (dirF >= 1.0f)
              {
                if (dirF >= 1.5f)
                {
                  if (dirF >= 2.0f)
                  {
                    if (dirF <= 2.50f)
                    {
                      temp1 = srcppY[x + 4];
                      temp2 = srcpY[x - 4];
                      temp = (srcppY[x + 4] + srcpY[x - 4] + 1) >> 1;
                    }
                    else
                    {
                      // too flat a direction: use the vertical cubic instead
                      temp1 = temp2 = srcpY[x];
                      temp = cubicInt<bits_per_pixel>(srcpppY[x], srcppY[x], srcpY[x], srcpnY[x]);
                    }
                  }
                  else
                  {
                    temp1 = (int)((dirF - 1.5f)*(srcppY[x + 4]) + (2.0f - dirF)*(srcppY[x + 3]) + 0.5f);
                    temp2 = (int)((dirF - 1.5f)*(srcpY[x - 4]) + (2.0f - dirF)*(srcpY[x - 3]) + 0.5f);
                    temp = (int)((dirF - 1.5f)*(srcppY[x + 4] + srcpY[x - 4]) + (2.0f - dirF)*(srcppY[x + 3] + srcpY[x - 3]) + 0.5f);
                  }
                }
                else
                {
                  temp1 = (int)((dirF - 1.0f)*(srcppY[x + 3]) + (1.5f - dirF)*(srcppY[x + 2]) + 0.5f);
                  temp2 = (int)((dirF - 1.0f)*(srcpY[x - 3]) + (1.5f - dirF)*(srcpY[x - 2]) + 0.5f);
                  temp = (int)((dirF - 1.0f)*(srcppY[x + 3] + srcpY[x - 3]) + (1.5f - dirF)*(srcppY[x + 2] + srcpY[x - 2]) + 0.5f);
                }
              }
              else
              {
                temp1 = (int)((dirF - 0.5f)*(srcppY[x + 2]) + (1.0f - dirF)*(srcppY[x + 1]) + 0.5f);
                temp2 = (int)((dirF - 0.5f)*(srcpY[x - 2]) + (1.0f - dirF)*(srcpY[x - 1]) + 0.5f);
                temp = (int)((dirF - 0.5f)*(srcppY[x + 2] + srcpY[x - 2]) + (1.0f - dirF)*(srcppY[x + 1] + srcpY[x - 1]) + 0.5f);
              }
            }
            else
            {
              temp1 = (int)(dirF*(srcppY[x + 1]) + (0.5f - dirF)*(srcppY[x]) + 0.5f);
              temp2 = (int)(dirF*(srcpY[x - 1]) + (0.5f - dirF)*(srcpY[x]) + 0.5f);
              temp = (int)(dirF*(srcppY[x + 1] + srcpY[x - 1]) + (0.5f - dirF)*(srcppY[x] + srcpY[x]) + 0.5f);
            }
          }
          else
          {
            // mirror image of the branch above for negative dirF
            if (dirF <= -0.5f)
            {
              if (dirF <= -1.0f)
              {
                if (dirF <= -1.5f)
                {
                  if (dirF <= -2.0f)
                  {
                    if (dirF >= -2.50f)
                    {
                      temp1 = srcppY[x - 4];
                      temp2 = srcpY[x + 4];
                      temp = (srcppY[x - 4] + srcpY[x + 4] + 1) >> 1;
                    }
                    else
                    {
                      temp1 = temp2 = srcpY[x];
                      temp = cubicInt<bits_per_pixel>(srcpppY[x], srcppY[x], srcpY[x], srcpnY[x]);
                    }
                  }
                  else
                  {
                    temp1 = (int)((-dirF - 1.5f)*(srcppY[x - 4]) + (2.0f + dirF)*(srcppY[x - 3]) + 0.5f);
                    temp2 = (int)((-dirF - 1.5f)*(srcpY[x + 4]) + (2.0f + dirF)*(srcpY[x + 3]) + 0.5f);
                    temp = (int)((-dirF - 1.5f)*(srcppY[x - 4] + srcpY[x + 4]) + (2.0f + dirF)*(srcppY[x - 3] + srcpY[x + 3]) + 0.5f);
                  }
                }
                else
                {
                  temp1 = (int)((-dirF - 1.0f)*(srcppY[x - 3]) + (1.5f + dirF)*(srcppY[x - 2]) + 0.5f);
                  temp2 = (int)((-dirF - 1.0f)*(srcpY[x + 3]) + (1.5f + dirF)*(srcpY[x + 2]) + 0.5f);
                  temp = (int)((-dirF - 1.0f)*(srcppY[x - 3] + srcpY[x + 3]) + (1.5f + dirF)*(srcppY[x - 2] + srcpY[x + 2]) + 0.5f);
                }
              }
              else
              {
                temp1 = (int)((-dirF - 0.5f)*(srcppY[x - 2]) + (1.0f + dirF)*(srcppY[x - 1]) + 0.5f);
                temp2 = (int)((-dirF - 0.5f)*(srcpY[x + 2]) + (1.0f + dirF)*(srcpY[x + 1]) + 0.5f);
                temp = (int)((-dirF - 0.5f)*(srcppY[x - 2] + srcpY[x + 2]) + (1.0f + dirF)*(srcppY[x - 1] + srcpY[x + 1]) + 0.5f);
              }
            }
            else
            {
              temp1 = (int)((-dirF)*(srcppY[x - 1]) + (0.5f + dirF)*(srcppY[x]) + 0.5f);
              temp2 = (int)((-dirF)*(srcpY[x + 1]) + (0.5f + dirF)*(srcpY[x]) + 0.5f);
              temp = (int)((-dirF)*(srcppY[x - 1] + srcpY[x + 1]) + (0.5f + dirF)*(srcppY[x] + srcpY[x]) + 0.5f);
            }
          }

          // thresholds scaled from the 8-bit values 20 / 25 / 60 to the real bit depth
          constexpr int Const20 = 20 << bitshift_to_8;
          constexpr int Const25 = 25 << bitshift_to_8;
          constexpr int Const60 = 60 << bitshift_to_8;

          // sanity check of the directional result: the two contributions must be
          // close to each other and the result must stay near the vertical
          // neighbours, otherwise fall back to the vertical cubic
          minN = std::min(srcppY[x], srcpY[x]) - Const25;
          maxN = std::max(srcppY[x], srcpY[x]) + Const25;
          if (abs(temp1 - temp2) > Const20 || abs(srcppY[x] + srcpY[x] - temp - temp) > Const60 || temp < minN || temp > maxN)
          {
            temp = cubicInt<bits_per_pixel>(srcpppY[x], srcppY[x], srcpY[x], srcpnY[x]);
          }
          else {
            // clamp to valid. cubicint clamps O.K.
            constexpr int max_pixel_value = (1 << bits_per_pixel) - 1;
            if (temp > max_pixel_value) temp = max_pixel_value;
            else if (temp < 0) temp = 0;
          }
          dstpY[x] = temp;
        }
        else
        {
          // border pixels: average near the top/bottom (where the outer rows are
          // not available), vertical cubic near the left/right edges
          if (y<3 || y>HeightY - 4) dstpY[x] = ((srcpY[x] + srcppY[x] + 1) >> 1);
          else dstpY[x] = cubicInt<bits_per_pixel>(srcpppY[x], srcppY[x], srcpY[x], srcpnY[x]);
        }
      }
    }
    // slide the four-row window down by two frame lines
    srcpppY = srcppY;
    srcppY = srcpY;
    srcpY = srcpnY;
    srcpnY += src_pitchY;
    maskpY += mask_pitchY;
    dstpY += dst_pitchY;
  }
  // ---- planes 2 (V) and 1 (U): vertical average / cubic only ----
  // both planes share the plane-2 dimensions and pitches
  for (y = 2 - field; y < HeightUV - 1; y += 2)
  {
    for (x = startxuv; x < stopxuv; ++x)
    {
      if (nomask || maskpV[x] == 0xFF)
      {
        if (y<3 || y>HeightUV - 4) dstpV[x] = ((srcpV[x] + srcppV[x] + 1) >> 1);
        else dstpV[x] = cubicInt<bits_per_pixel>(srcpppV[x], srcppV[x], srcpV[x], srcpnV[x]);
      }
      if (nomask || maskpU[x] == 0xFF)
      {
        if (y<3 || y>HeightUV - 4) dstpU[x] = ((srcpU[x] + srcppU[x] + 1) >> 1);
        else dstpU[x] = cubicInt<bits_per_pixel>(srcpppU[x], srcppU[x], srcpU[x], srcpnU[x]);
      }
    }
    // slide both chroma windows down by two frame lines
    srcpppV = srcppV;
    srcppV = srcpV;
    srcpV = srcpnV;
    srcpnV += src_pitchUV;
    srcpppU = srcppU;
    srcppU = srcpU;
    srcpU = srcpnU;
    srcpnU += src_pitchUV;
    maskpV += mask_pitchUV;
    maskpU += mask_pitchUV;
    dstpV += dst_pitchUV;
    dstpU += dst_pitchUV;
  }
}

//void TFMPP::elaDeintYUY2(const VSFrameRef *dst, const VSFrameRef *mask, const VSFrameRef *src, bool nomask, int field)
//{
//  const uint8_t *srcp = src->GetReadPtr();
//  int src_pitch = src->GetPitch();
//  int Width = src->GetRowSize();
//  int Height = src->GetHeight();
//  uint8_t *dstp = dst->GetWritePtr();
//  int dst_pitch = dst->GetPitch();
//  const uint8_t *maskp = mask->GetPtr();
//  int mask_pitch = mask->GetPitch();
//  srcp += src_pitch*(3 - field);
//  dstp += dst_pitch*(2 - field);
//  maskp += mask_pitch*(2 - field);
//  src_pitch <<= 1;
//  dst_pitch <<= 1;
//  mask_pitch <<= 1;
//  const uint8_t *srcpp = srcp - src_pitch;
//  const uint8_t *srcppp = srcpp - src_pitch;
//  const uint8_t *srcpn = srcp + src_pitch;
//  int stopx = Width;
//  int Iy1, Iy2, Iye, Ix1, Ix2, edgeS1, edgeS2, sum, sumsq, temp, temp1, temp2, minN, maxN, x, y;
//  double dir1, dir2, dir, dirF;
//  for (y = 2 - field; y < Height - 1; y += 2)
//  {
//    for (x = 0; x < stopx; ++x)
//    {
//      if (nomask || maskp[x] == 0xFF)
//      {
//        if (y > 2 && y < Height - 3 && x>7 && x < Width - 9)
//        {
//          Iy1 = -srcp[x - 2] - srcp[x] - srcp[x] - srcp[x + 2] + srcppp[x - 2] + srcppp[x] + srcppp[x] + srcppp[x + 2];
//          Iy2 = -srcpn[x - 2] - srcpn[x] - srcpn[x] - srcpn[x + 2] + srcpp[x - 2] + srcpp[x] + srcpp[x] + srcpp[x + 2];
//          Ix1 = srcppp[x + 2] + srcpp[x + 2] + srcpp[x + 2] + srcp[x + 2] - srcppp[x - 2] - srcpp[x - 2] - srcpp[x - 2] - srcp[x - 2];
//          Ix2 = srcpp[x + 2] + srcp[x + 2] + srcp[x + 2] + srcpn[x + 2] - srcpp[x - 2] - srcp[x - 2] - srcp[x - 2] - srcpn[x - 2];
//          edgeS1 = Ix1*Ix1 + Iy1*Iy1;
//          edgeS2 = Ix2*Ix2 + Iy2*Iy2;
//          if (edgeS1 < 1600 && edgeS2 < 1600)
//          {
//            dstp[x] = (srcpp[x] + srcp[x] + 1) >> 1;
//            goto chromajump;
//          }
//          if (abs(srcpp[x] - srcp[x]) < 10 && (edgeS1 < 1600 || edgeS2 < 1600))
//          {
//            dstp[x] = (srcpp[x] + srcp[x] + 1) >> 1;
//            goto chromajump;
//          }
//          sum = srcpp[x - 2] + srcpp[x] + srcpp[x + 2] + srcp[x - 2] + srcp[x] + srcp[x + 2];
//          sumsq = srcpp[x - 2] * srcpp[x - 2] + srcpp[x] * srcpp[x] + srcpp[x + 2] * srcpp[x + 2] +
//            srcp[x - 2] * srcp[x - 2] + srcp[x] * srcp[x] + srcp[x + 2] * srcp[x + 2];
//          if ((6 * sumsq - sum*sum) < 432)
//          {
//            dstp[x] = (srcpp[x] + srcp[x] + 1) >> 1;
//            goto chromajump;
//          }
//          if (Ix1 == 0) dir1 = 3.1415926;
//          else
//          {
//            dir1 = atan(Iy1 / (Ix1*2.0f)) + 1.5707963;
//            if (Iy1 >= 0) { if (Ix1 < 0) dir1 += 3.1415927; }
//            else { if (Ix1 >= 0) dir1 += 3.1415927; }
//            if (dir1 >= 3.1415927) dir1 -= 3.1415927;
//          }
//          if (Ix2 == 0) dir2 = 3.1415926;
//          else
//          {
//            dir2 = atan(Iy2 / (Ix2*2.0f)) + 1.5707963;
//            if (Iy2 >= 0) { if (Ix2 < 0) dir2 += 3.1415927; }
//            else { if (Ix2 >= 0) dir2 += 3.1415927; }
//            if (dir2 >= 3.1415927) dir2 -= 3.1415927;
//          }
//          if (fabs(dir1 - dir2) < 0.5f)
//          {
//            if (edgeS1 >= 3600 && edgeS2 >= 3600) dir = (dir1 + dir2) * 0.5f;
//            else dir = edgeS1 >= edgeS2 ? dir1 : dir2;
//          }
//          else
//          {
//            if (edgeS1 >= 5000 && edgeS2 >= 5000)
//            {
//              Iye = -srcp[x - 2] - srcp[x] - srcp[x] - srcp[x + 2] + srcpp[x - 2] + srcpp[x] + srcpp[x] + srcpp[x + 2];
//              if ((Iy1*Iye > 0) && (Iy2*Iye < 0)) dir = dir1;
//              else if ((Iy1*Iye < 0) && (Iy2*Iye > 0)) dir = dir2;
//              else
//              {
//                if (abs(Iye - Iy1) <= abs(Iye - Iy2)) dir = dir1;
//                else dir = dir2;
//              }
//            }
//            else dir = edgeS1 >= edgeS2 ? dir1 : dir2;
//          }
//          dirF = 0.5f / tan(dir);
//          if (dirF >= 0.0f)
//          {
//            if (dirF >= 0.5f)
//            {
//              if (dirF >= 1.0f)
//              {
//                if (dirF >= 1.5f)
//                {
//                  if (dirF >= 2.0f)
//                  {
//                    if (dirF <= 2.50f)
//                    {
//                      temp1 = srcpp[x + 8];
//                      temp2 = srcp[x - 8];
//                      temp = (srcpp[x + 8] + srcp[x - 8] + 1) >> 1;
//                    }
//                    else
//                    {
//                      temp1 = temp2 = srcp[x];
//                      temp = cubicInt<8>(srcppp[x], srcpp[x], srcp[x], srcpn[x]);
//                    }
//                  }
//                  else
//                  {
//                    temp1 = (int)((dirF - 1.5f)*(srcpp[x + 8]) + (2.0f - dirF)*(srcpp[x + 6]) + 0.5f);
//                    temp2 = (int)((dirF - 1.5f)*(srcp[x - 8]) + (2.0f - dirF)*(srcp[x - 6]) + 0.5f);
//                    temp = (int)((dirF - 1.5f)*(srcpp[x + 8] + srcp[x - 8]) + (2.0f - dirF)*(srcpp[x + 6] + srcp[x - 6]) + 0.5f);
//                  }
//                }
//                else
//                {
//                  temp1 = (int)((dirF - 1.0f)*(srcpp[x + 6]) + (1.5f - dirF)*(srcpp[x + 4]) + 0.5f);
//                  temp2 = (int)((dirF - 1.0f)*(srcp[x - 6]) + (1.5f - dirF)*(srcp[x - 4]) + 0.5f);
//                  temp = (int)((dirF - 1.0f)*(srcpp[x + 6] + srcp[x - 6]) + (1.5f - dirF)*(srcpp[x + 4] + srcp[x - 4]) + 0.5f);
//                }
//              }
//              else
//              {
//                temp1 = (int)((dirF - 0.5f)*(srcpp[x + 4]) + (1.0f - dirF)*(srcpp[x + 2]) + 0.5f);
//                temp2 = (int)((dirF - 0.5f)*(srcp[x - 4]) + (1.0f - dirF)*(srcp[x - 2]) + 0.5f);
//                temp = (int)((dirF - 0.5f)*(srcpp[x + 4] + srcp[x - 4]) + (1.0f - dirF)*(srcpp[x + 2] + srcp[x - 2]) + 0.5f);
//              }
//            }
//            else
//            {
//              temp1 = (int)(dirF*(srcpp[x + 2]) + (0.5f - dirF)*(srcpp[x]) + 0.5f);
//              temp2 = (int)(dirF*(srcp[x - 2]) + (0.5f - dirF)*(srcp[x]) + 0.5f);
//              temp = (int)(dirF*(srcpp[x + 2] + srcp[x - 2]) + (0.5f - dirF)*(srcpp[x] + srcp[x]) + 0.5f);
//            }
//          }
//          else
//          {
//            if (dirF <= -0.5f)
//            {
//              if (dirF <= -1.0f)
//              {
//                if (dirF <= -1.5f)
//                {
//                  if (dirF <= -2.0f)
//                  {
//                    if (dirF >= -2.50f)
//                    {
//                      temp1 = srcpp[x - 8];
//                      temp2 = srcp[x + 8];
//                      temp = (srcpp[x - 8] + srcp[x + 8] + 1) >> 1;
//                    }
//                    else
//                    {
//                      temp1 = temp2 = srcp[x];
//                      temp = cubicInt<8>(srcppp[x], srcpp[x], srcp[x], srcpn[x]);
//                    }
//                  }
//                  else
//                  {
//                    temp1 = (int)((-dirF - 1.5f)*(srcpp[x - 8]) + (2.0f + dirF)*(srcpp[x - 6]) + 0.5f);
//                    temp2 = (int)((-dirF - 1.5f)*(srcp[x + 8]) + (2.0f + dirF)*(srcp[x + 6]) + 0.5f);
//                    temp = (int)((-dirF - 1.5f)*(srcpp[x - 8] + srcp[x + 8]) + (2.0f + dirF)*(srcpp[x - 6] + srcp[x + 6]) + 0.5f);
//                  }
//                }
//                else
//                {
//                  temp1 = (int)((-dirF - 1.0f)*(srcpp[x - 6]) + (1.5f + dirF)*(srcpp[x - 4]) + 0.5f);
//                  temp2 = (int)((-dirF - 1.0f)*(srcp[x + 6]) + (1.5f + dirF)*(srcp[x + 4]) + 0.5f);
//                  temp = (int)((-dirF - 1.0f)*(srcpp[x - 6] + srcp[x + 6]) + (1.5f + dirF)*(srcpp[x - 4] + srcp[x + 4]) + 0.5f);
//                }
//              }
//              else
//              {
//                temp1 = (int)((-dirF - 0.5f)*(srcpp[x - 4]) + (1.0f + dirF)*(srcpp[x - 2]) + 0.5f);
//                temp2 = (int)((-dirF - 0.5f)*(srcp[x + 4]) + (1.0f + dirF)*(srcp[x + 2]) + 0.5f);
//                temp = (int)((-dirF - 0.5f)*(srcpp[x - 4] + srcp[x + 4]) + (1.0f + dirF)*(srcpp[x - 2] + srcp[x + 2]) + 0.5f);
//              }
//            }
//            else
//            {
//              temp1 = (int)((-dirF)*(srcpp[x - 2]) + (0.5f + dirF)*(srcpp[x]) + 0.5f);
//              temp2 = (int)((-dirF)*(srcp[x + 2]) + (0.5f + dirF)*(srcp[x]) + 0.5f);
//              temp = (int)((-dirF)*(srcpp[x - 2] + srcp[x + 2]) + (0.5f + dirF)*(srcpp[x] + srcp[x]) + 0.5f);
//            }
//          }
//          minN = std::min(srcpp[x], srcp[x]) - 25;
//          maxN = std::max(srcpp[x], srcp[x]) + 25;
//          if (abs(temp1 - temp2) > 20 || abs(srcpp[x] + srcp[x] - temp - temp) > 60 || temp < minN || temp > maxN)
//          {
//            temp = cubicInt<8>(srcppp[x], srcpp[x], srcp[x], srcpn[x]);
//          }
//          if (temp > 255) temp = 255;
//          else if (temp < 0) temp = 0;
//          dstp[x] = temp;
//        }
//        else
//        {
//          if (y<3 || y>Height - 4) dstp[x] = ((srcp[x] + srcpp[x] + 1) >> 1);
//          else dstp[x] = cubicInt<8>(srcppp[x], srcpp[x], srcp[x], srcpn[x]);
//        }
//      }
//    chromajump:
//      ++x;
//      if (nomask || maskp[x] == 0xFF)
//      {
//        if (y<3 || y>Height - 4) dstp[x] = ((srcp[x] + srcpp[x] + 1) >> 1);
//        else dstp[x] = cubicInt<8>(srcppp[x], srcpp[x], srcp[x], srcpn[x]);
//      }
//    }
//    srcppp = srcpp;
//    srcpp = srcp;
//    srcp = srcpn;
//    srcpn += src_pitch;
//    maskp += mask_pitch;
//    dstp += dst_pitch;
//  }
//}

// hbd ready
void TFMPP::maskClip2(const VSFrameRef *src, const VSFrameRef *deint, const VSFrameRef *mask,
  VSFrameRef *dst) const
{
  const bool use_sse2 = cpuFlags.sse2;
  const bool use_sse4 = cpuFlags.sse4_1;

  const uint8_t *srcp, *maskp, *dntp;
  uint8_t *dstp;
  int src_pitch, msk_pitch, dst_pitch, dnt_pitch;

  const int np = vi->format->numPlanes;
  const int pixelsize = vi->format->bytesPerSample;

  for (int b = 0; b < np; ++b)
  {
    const int plane = b;
    srcp = vsapi->getReadPtr(src, plane);
//    const int rowsize = src->GetRowSize(plane); // YUY2: vi.width is not GetRowSize
    const int width = vsapi->getFrameWidth(src, plane);
    const int height = vsapi->getFrameHeight(src, plane);
    src_pitch = vsapi->getStride(src, plane);

    maskp = vsapi->getReadPtr(mask, b);
    msk_pitch = vsapi->getStride(mask, b);

    dntp = vsapi->getReadPtr(deint, plane);
    dnt_pitch = vsapi->getStride(deint, plane);
    dstp = vsapi->getWritePtr(dst, plane);
    dst_pitch = vsapi->getStride(dst, plane);

    using maskClip2_fn_t = decltype(maskClip2_SSE2);
    maskClip2_fn_t* maskClip2_fn;

    if (pixelsize == 1) {
      if (use_sse4)
        maskClip2_fn = maskClip2_SSE4<uint8_t>;
      else if (use_sse2)
        maskClip2_fn = maskClip2_SSE2;
      else
        maskClip2_fn = maskClip2_C<uint8_t>;
    }
    else if (pixelsize == 2) {
      if (use_sse4)
        maskClip2_fn = maskClip2_SSE4<uint16_t>;
      else
        maskClip2_fn = maskClip2_C<uint16_t>;
    }
    else {
      return; // n/a no float support
    }

    maskClip2_fn(srcp, dntp, maskp, dstp, src_pitch, dnt_pitch, msk_pitch, dst_pitch, width, height);
  }
}


// Scalar mask merge.
//
// For every pixel of a width x height plane, writes the `dntp` pixel to `dstp`
// when the corresponding mask byte equals 0xFF and the `srcp` pixel otherwise.
// The mask is indexed per pixel with one byte per pixel, independent of
// pixel_t. All pitches are byte distances between consecutive rows.
template<typename pixel_t>
void maskClip2_C(const uint8_t* srcp, const uint8_t* dntp,
  const uint8_t* maskp, uint8_t* dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height)
{
  for (int row = 0; row < height; ++row)
  {
    const pixel_t* const srcRow = reinterpret_cast<const pixel_t*>(srcp);
    const pixel_t* const dntRow = reinterpret_cast<const pixel_t*>(dntp);
    pixel_t* const outRow = reinterpret_cast<pixel_t*>(dstp);

    for (int col = 0; col < width; ++col)
    {
      const bool takeDeint = (maskp[col] == 0xFF);
      outRow[col] = takeDeint ? dntRow[col] : srcRow[col];
    }

    srcp += src_pitch;
    dntp += dnt_pitch;
    maskp += msk_pitch;
    dstp += dst_pitch;
  }
}

// SSE4.1 mask merge for 8-bit (uint8_t) and 16-bit (uint16_t) samples.
//
// Works on 16 bytes of pixel data per step using aligned loads/stores, so the
// pixel rows must be 16-byte aligned and padded to a multiple of 16 bytes.
// The mask always holds one byte per pixel (0x00 or 0xFF); for 16-bit pixels
// eight mask bytes are sign-extended so that 0xFF becomes 0xFFFF.
// All pitches are byte distances between consecutive rows.
template<typename pixel_t>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("sse4.1")))
#endif
void maskClip2_SSE4(const uint8_t* srcp, const uint8_t* dntp,
  const uint8_t* maskp, uint8_t* dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height)
{
  constexpr int kPixelsPerStep = static_cast<int>(16 / sizeof(pixel_t));

  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft)
  {
    for (int x = 0; x < width; x += kPixelsPerStep)
    {
      const size_t byteOffset = x * sizeof(pixel_t);

      __m128i selector;
      if constexpr (sizeof(pixel_t) != 1)
      {
        // widen 8 mask bytes to 8 words, keeping FF as FFFF
        const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(maskp + x));
        selector = _mm_cvtepi8_epi16(packed);
      }
      else
      {
        selector = _mm_load_si128(reinterpret_cast<const __m128i*>(maskp + x));
      }

      const __m128i fromDeint = _mm_load_si128(reinterpret_cast<const __m128i*>(dntp + byteOffset));
      const __m128i fromSource = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + byteOffset));

      // blendv(a, b, m): picks b where m is set, a otherwise
      const __m128i merged = _mm_blendv_epi8(fromSource, fromDeint, selector);
      _mm_store_si128(reinterpret_cast<__m128i*>(dstp + byteOffset), merged);
    }

    maskp += msk_pitch;
    dstp += dst_pitch;
    dntp += dnt_pitch;
    srcp += src_pitch;
  }
}

// 8 bit only
// SSE2 mask merge: dst = (mask & dnt) | (~mask & src), 16 bytes per step.
// Uses aligned loads/stores, so rows must be 16-byte aligned and padded to a
// multiple of 16 bytes. All pitches are byte distances between rows.
void maskClip2_SSE2(const uint8_t *srcp, const uint8_t *dntp,
  const uint8_t *maskp, uint8_t *dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height)
{
  for (int rowsLeft = height; rowsLeft != 0; --rowsLeft)
  {
    for (int x = 0; x < width; x += 16)
    {
      // the mask holds one byte per pixel
      const __m128i selector = _mm_load_si128(reinterpret_cast<const __m128i *>(maskp + x));
      const __m128i fromDeint = _mm_load_si128(reinterpret_cast<const __m128i *>(dntp + x));
      const __m128i fromSource = _mm_load_si128(reinterpret_cast<const __m128i *>(srcp + x));

      const __m128i keepDeint = _mm_and_si128(selector, fromDeint);
      const __m128i keepSource = _mm_andnot_si128(selector, fromSource); // (~selector) & src

      _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), _mm_or_si128(keepSource, keepDeint));
    }

    maskp += msk_pitch;
    dstp += dst_pitch;
    dntp += dnt_pitch;
    srcp += src_pitch;
  }
}


// Constructs the TFMPP post-processing filter.
//
// Steps performed:
//   1. query the video info of `_child` and capture the CPU feature flags
//      (all flags are cleared when opt == 0),
//   2. validate the clip format, PP, opt and - if given - clip2,
//   3. parse the optional override file `_ovr` into setArray,
//   4. allocate the scratch mask frame `mmask`.
//
// Override file syntax (one entry per line; empty lines and lines starting
// with ';' or '#' are ignored):
//   "<frame> <P|M> <value>"          single frame
//   "<first>,<last> <P|M> <value>"   frame range, <last> == 0 means last frame
// The first space on the line must be directly followed by 'P' or 'M'.
// Each accepted line is stored as four ints {type, first, last, value}; the
// array is padded with one extra record of 0xffffffff values.
// Lines whose numbers cannot be parsed are skipped, like other unrecognised
// lines.
//
// Throws TIVTCError for unsupported formats, bad PP/opt values, a clip2 that
// does not match the input, an override file that cannot be opened and for
// out-of-range override values.
TFMPP::TFMPP(VSNodeRef *_child, int _PP, int _mthresh, const char* _ovr, bool _display,
  VSNodeRef *_clip2, bool _usehints, int _opt, const VSAPI *_vsapi, VSCore *core)
    : vsapi(_vsapi), child(_child),
  PP(_PP), mthresh(_mthresh), ovr(_ovr ? _ovr : ""), display(_display), clip2(_clip2),
  usehints(_usehints), opt(_opt)
{
    vi = vsapi->getVideoInfo(child);

  mmask = nullptr;

  // Zero-initialised so that no value is ever read in an indeterminate state.
  int w = 0, i = 0, z = 0, b = 0, q = 0, countOvrS = 0;
  char linein[1024], *linep, *linet;
  std::unique_ptr<FILE, decltype (&fclose)> f(nullptr, nullptr);

  // Appends one {type, first, last, value} record to setArray. The array is
  // sized from a first pass over the file; if the file grew before the second
  // pass, refuse to write into (or beyond) the trailing sentinel record.
  const auto storeOvr = [this, &i](int type, int first, int last, int value)
  {
    if (static_cast<size_t>(i) + 8 > setArray.size())
      throw TIVTCError("TFMPP:  ovr file error (file changed while reading)!");
    setArray[i] = type; ++i;
    setArray[i] = first; ++i;
    setArray[i] = last; ++i;
    setArray[i] = value; ++i;
  };

  cpuFlags = *getCPUFeatures();
  if (opt == 0) memset(&cpuFlags, 0, sizeof(cpuFlags));

  if (vi->format->bitsPerSample > 16)
    throw TIVTCError("TFMPP:  only 8-16 bit formats supported!");
  if (vi->format->sampleType != stInteger)
      throw TIVTCError("TFMPP: only integer formats supported!");
  if (vi->format->colorFamily != cmYUV)
    throw TIVTCError("TFMPP:  YUV data only!");
  if (vi->height & 1 || vi->width & 1)
    throw TIVTCError("TFMPP:  height and width must be divisible by 2!");
  if (PP < 2 || PP > 7)
    throw TIVTCError("TFMPP:  PP must be set to 2, 3, 4, 5, 6, or 7!");
  if (opt < 0 || opt > 4)
    throw TIVTCError("TFMPP:  opt must be set to 0, 1, 2, 3, or 4!");
  if (clip2)
  {
    uC2 = true;
    const VSVideoInfo *vi2 = vsapi->getVideoInfo(clip2);
//    if (vi2.BitsPerComponent() != vi.BitsPerComponent())
//      throw TIVTCError("TFMPP:  clip2 bit depth do not match input clip!!");
//    if (!vi2.IsYUV())
//      throw TIVTCError("TFMPP:  clip2 must be in YUV colorspace!");
    if (vi->format != vi2->format)
      throw TIVTCError("TFMPP:  clip2 colorspace must be the same as input clip!");
    if (vi2->height != vi->height || vi2->width != vi->width)
      throw TIVTCError("TFMPP:  clip2 frame dimensions do not match input clip!");
    if (vi2->numFrames != vi->numFrames)
      throw TIVTCError("TFMPP:  clip2 does not have the same number of frames as input clip!");
  }
  else 
    uC2 = false;

//  child->SetCacheHints(CACHE_GENERIC, 3); // fixed to diameter (07/30/2005)


  nfrms = vi->numFrames - 1;
  PP_origSaved = PP;
  mthresh_origSaved = mthresh;
  i = 0;
  if (ovr.size())
  {
    if ((f = decltype(f) (tivtc_fopen(ovr.c_str(), "r"), &fclose)) != nullptr)
    {
      // Pass 1: count candidate lines (anything mentioning 'M' or 'P') so the
      // record array can be sized up front.
      countOvrS = 0;
      while (fgets(linein, 1024, f.get()) != nullptr)
      {
        if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
          continue;
        linep = linein;
        while (*linep != 'M' && *linep != 'P' && *linep != 0) linep++;
        if (*linep != 0) ++countOvrS;
      }

      if (countOvrS == 0) { goto emptyovrFM; }
      ++countOvrS; // one extra record that stays filled with the sentinel
      countOvrS *= 4;
      setArray.resize(countOvrS, 0xffffffff);

      // Pass 2: reopen the file and parse the entries.
      if ((f = decltype(f) (tivtc_fopen(ovr.c_str(), "r"), &fclose)) != nullptr)
      {
        while (fgets(linein, 1024, f.get()) != nullptr)
        {
          if (linein[0] == 0 || linein[0] == '\n' || linein[0] == '\r' || linein[0] == ';' || linein[0] == '#')
            continue;
          linep = linein;
          while (*linep != 0 && *linep != ' ' && *linep != ',') linep++;
          if (*linep == ' ')
          {
            // single frame entry; skip lines consisting of blanks only
            linet = linein;
            while (*linet != 0)
            {
              if (*linet != ' ' && *linet != 10) break;
              linet++;
            }
            if (*linet == 0) { continue; }
            linep++;
            if (*linep == 'M' || *linep == 'P')
            {
              // no parsable frame number: treat as an unrecognised line
              if (sscanf(linein, "%d", &z) != 1) continue;
              if (z<0 || z>nfrms)
              {
                throw TIVTCError("TFMPP:  ovr input error (out of range frame #)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*linep == 'P' || *linep == 'M')
                {
                  q = *linep;
                  // step over the type letter and its separator without ever
                  // moving past the end of the string
                  linep++;
                  if (*linep == 0) continue;
                  linep++;
                  if (*linep == 0) continue;
                  if (sscanf(linep, "%d", &b) != 1) continue;
                  if (q == 'P' && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFMPP:  ovr input error (bad PP value)!");
                  }
                  else if (q != 'P' && q != 'M') continue;
                  storeOvr(q, z, z, b);
                }
              }
            }
          }
          else if (*linep == ',')
          {
            // frame range entry
            while (*linep != ' ' && *linep != 0) linep++;
            if (*linep == 0) continue;
            linep++;
            if (*linep == 'P' || *linep == 'M')
            {
              // both range bounds are required
              if (sscanf(linein, "%d,%d", &z, &w) != 2) continue;
              if (w == 0) w = nfrms;
              if (z<0 || z>nfrms || w<0 || w>nfrms || w < z)
              {
                throw TIVTCError("TFMPP: ovr input error (invalid frame range)!");
              }
              linep = linein;
              while (*linep != ' ' && *linep != 0) linep++;
              if (*linep != 0)
              {
                linep++;
                if (*linep == 'M' || *linep == 'P')
                {
                  q = *linep;
                  linep++;
                  if (*linep == 0) continue;
                  linep++;
                  if (*linep == 0) continue;
                  if (sscanf(linep, "%d", &b) != 1) continue;
                  if (q == 'P' && (b < 0 || b > 7))
                  {
                    throw TIVTCError("TFMPP:  ovr input error (bad PP value)!");
                  }
                  else if (q != 'M' && q != 'P') continue;
                  storeOvr(q, z, w, b);
                }
              }
            }
          }
        }
      }
      else {
          throw TIVTCError("TFMPP:  ovr file error (could not open file)!");
      }
    }
    else {
        throw TIVTCError("TFMPP:  ovr input error (could not open ovr file)!");
    }
  }
emptyovrFM:
  mmask = vsapi->newVideoFrame(vi->format, vi->width, vi->height, nullptr, core);
}

// Releases what the filter holds: the scratch mask frame, when one was
// allocated, followed by the references to child and clip2 (clip2 is passed
// to freeNode unconditionally, also when it is null).
TFMPP::~TFMPP()
{
  if (mmask != nullptr)
  {
    vsapi->freeFrame(mmask);
  }

  vsapi->freeNode(child);
  vsapi->freeNode(clip2);
}
07070100000014000081A4000000000000000000000001671240C9000017B8000000000000000000000000000000000000002B00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMPP.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <string>
#include <vector>
#include <math.h>
#include <VapourSynth.h>
#include "cpufeatures.h"
#ifdef VERSION
#undef VERSION
#endif
#define VERSION "v1.0.3"

// Scalar mask merge: for each pixel, dstp receives the dntp pixel where the
// mask byte is 0xFF and the srcp pixel otherwise. The mask has one byte per
// pixel regardless of pixel_t; pitches are in bytes.
template<typename pixel_t>
void maskClip2_C(const uint8_t* srcp, const uint8_t* dntp,
  const uint8_t* maskp, uint8_t* dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height);

// SSE2 mask merge for 8-bit samples only; processes 16 bytes per step with
// aligned loads and stores.
void maskClip2_SSE2(const uint8_t* srcp, const uint8_t* dntp,
  const uint8_t* maskp, uint8_t* dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height);

// SSE4.1 mask merge for 8-bit and 16-bit samples. The target attribute is
// only applied when the build defines GCC or CLANG.
template<typename pixel_t>
#if defined(GCC) || defined(CLANG)
__attribute__((__target__("sse4.1")))
#endif 
void maskClip2_SSE4(const uint8_t* srcp, const uint8_t* dntp,
  const uint8_t* maskp, uint8_t* dstp, int src_pitch, int dnt_pitch,
  int msk_pitch, int dst_pitch, int width, int height);

// Blend deinterlacing kernels (SSE2 and scalar). Defined outside this header.
// NOTE(review): with_mask presumably selects whether maskp is consulted -
// confirm against the definitions.
template<bool with_mask>
void blendDeintMask_SSE2(const uint8_t* srcp, uint8_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height);

template<typename pixel_t, bool with_mask>
void blendDeintMask_C(const pixel_t* srcp, pixel_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height);

// Cubic deinterlacing kernels (SSE2 and scalar). Defined outside this header.
// NOTE(review): the pitch units of the typed (pixel_t*) variants are not
// visible here - confirm against the definitions before calling.
template<bool with_mask>
void cubicDeintMask_SSE2(const uint8_t* srcp, uint8_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height);

template<typename pixel_t, int bits_per_pixel, bool with_mask>
void cubicDeintMask_C(const pixel_t* srcp, pixel_t* dstp,
  const uint8_t* maskp, int src_pitch, int dst_pitch, int msk_pitch,
  int width, int height);

// Post-processing stage of TFM for VapourSynth.
//
// An instance holds references to its input node(s) and a scratch mask frame;
// all of them are released in the destructor. Because of that ownership the
// class is non-copyable (a copy would release the same references twice).
class TFMPP
{
private:
    const VSAPI *vsapi;
    VSNodeRef *child;    // input clip; freed in the destructor

  CPUFeatures cpuFlags;  // CPU feature snapshot; cleared when opt == 0

  int PP, mthresh;
  std::string ovr;       // path of the override file, empty when unused
  bool display;
  VSNodeRef *clip2;      // optional second clip (may be null); freed in the destructor
  bool usehints;
  int opt;
  bool uC2; // use clip2
  int PP_origSaved;      // PP as passed to the constructor
  int mthresh_origSaved; // mthresh as passed to the constructor
  int nfrms;             // index of the last frame (numFrames - 1)
  std::vector<int> setArray; // override records: {type, first, last, value} quadruples
  VSFrameRef *mmask;     // scratch mask frame allocated by the constructor

  void buildMotionMask(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
    VSFrameRef *mask, int use) const;
  template<typename pixel_t>
  void buildMotionMask_core(const VSFrameRef *prv, const VSFrameRef *src, const VSFrameRef *nxt,
    VSFrameRef* mask, int use) const;
  // Writes the deint pixel to dst where mask is 0xFF, the src pixel elsewhere.
  void maskClip2(const VSFrameRef *src, const VSFrameRef *deint, const VSFrameRef *mask,
    VSFrameRef *dst) const;

//  void putHint(VSFrameRef *dst, int field, unsigned int hint);
//  template<typename pixel_t>
//  void putHint_core(VSFrameRef *dst, int field, unsigned int hint);
  void getProperties(const VSFrameRef *src, int& field, bool& combed) const;
//  template<typename pixel_t>
//  bool getHint_core(const VSFrameRef *src, int& field, bool& combed, unsigned int& hint);

  void getSetOvr(int n);

//  void denoiseYUY2(VSFrameRef *mask);
  void denoisePlanar(VSFrameRef *mask) const;

//  void linkYUY2(VSFrameRef *mask);
  template<int planarType>
  void linkPlanar(VSFrameRef *mask) const;

//  void destroyHint(VSFrameRef *dst, unsigned int hint);
//  template<typename pixel_t>
//  void destroyHint_core(VSFrameRef *dst, unsigned int hint);

  void BlendDeint(const VSFrameRef *src, const VSFrameRef *mask, VSFrameRef *dst,
    bool nomask) const;
  template<typename pixel_t>
  void BlendDeint_core(const VSFrameRef *src, const VSFrameRef* mask, VSFrameRef *dst,
    bool nomask) const;

  void CubicDeint(const VSFrameRef *src, const VSFrameRef *mask, VSFrameRef *dst, bool nomask,
    int field) const;
  template<typename pixel_t, int bits_per_pixel>
  void CubicDeint_core(const VSFrameRef *src, const VSFrameRef* mask, VSFrameRef *dst, bool nomask,
    int field) const;

  void elaDeint(VSFrameRef *dst, const VSFrameRef *mask, const VSFrameRef *src, bool nomask, int field) const;
  // not the same as in tdeinterlace.
  template<typename pixel_t, int bits_per_pixel>
  void elaDeintPlanar(VSFrameRef *dst, const VSFrameRef *mask, const VSFrameRef *src, bool nomask, int field) const;
//  void elaDeintYUY2(VSFrameRef *dst, const VSFrameRef *mask, const VSFrameRef *src, bool nomask, int field);

  void copyField(VSFrameRef *dst, const VSFrameRef *src, int field) const;
  void buildMotionMask1_SSE2(const uint8_t *srcp1, const uint8_t *srcp2,
    uint8_t *dstp, int s1_pitch, int s2_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpu) const;
  void buildMotionMask2_SSE2(const uint8_t *srcp1, const uint8_t *srcp2,
    const uint8_t *srcp3, uint8_t *dstp, int s1_pitch, int s2_pitch,
    int s3_pitch, int dst_pitch, int width, int height, const CPUFeatures *cpu) const;

  void writeDisplay(VSFrameRef *dst, int n, int field) const;

public:
  const VSVideoInfo *vi; // video info of the input clip

  const VSFrameRef *GetFrame(int n, int activationReason, VSFrameContext *frameCtx, VSCore *core);
  // Validates the arguments, parses the override file and allocates the mask
  // frame; throws TIVTCError on invalid input.
  TFMPP(VSNodeRef *_child, int _PP, int _mthresh, const char* _ovr, bool _display, VSNodeRef *_clip2,
    bool _usehints, int _opt, const VSAPI *_vsapi, VSCore *core);
  ~TFMPP();

  // The destructor frees child, clip2 and mmask, so copies must not exist.
  TFMPP(const TFMPP&) = delete;
  TFMPP& operator=(const TFMPP&) = delete;
};
07070100000015000081A4000000000000000000000001671240C900004546000000000000000000000000000000000000003100000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMPlanar.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <cstring>
#include "TFM.h"
#include "TFMasm.h"
#include "TCommonASM.h"
#include <algorithm>


// Collects the plane pointers, strides and chroma dimensions of the combing
// mask frame and hands them to do_FillCombedPlanarUpdateCmaskByUV for the
// given chroma layout (planarType: 411, 420, 422 or 444).
template<int planarType>
void FillCombedPlanarUpdateCmaskByUV(VSFrameRef* cmask, const VSAPI *vsapi)
{
  uint8_t* maskY = vsapi->getWritePtr(cmask, 0);
  uint8_t* maskU = vsapi->getWritePtr(cmask, 1);
  uint8_t* maskV = vsapi->getWritePtr(cmask, 2);

  const int strideY = vsapi->getStride(cmask, 0);
  const int strideUV = vsapi->getStride(cmask, 2);

  // width and height are those of a chroma plane, not of luma
  const int chromaWidth = vsapi->getFrameWidth(cmask, 2);
  const int chromaHeight = vsapi->getFrameHeight(cmask, 2);

  do_FillCombedPlanarUpdateCmaskByUV<planarType>(maskY, maskU, maskV,
    chromaWidth, chromaHeight, strideY, strideUV);
}

// explicit instantiations for the supported chroma layouts
template void FillCombedPlanarUpdateCmaskByUV<411>(VSFrameRef* cmask, const VSAPI *vsapi);
template void FillCombedPlanarUpdateCmaskByUV<420>(VSFrameRef* cmask, const VSAPI *vsapi);
template void FillCombedPlanarUpdateCmaskByUV<422>(VSFrameRef* cmask, const VSAPI *vsapi);
template void FillCombedPlanarUpdateCmaskByUV<444>(VSFrameRef* cmask, const VSAPI *vsapi);

//FIXME: unify this with TDeInterlace::CheckedCombedPlanar at some point.
//The two are similar, but cmask is a real PVideoFrame there.
// Builds the combing mask `cmask` for `src`.
//
// For luma only, or for all planes when `chroma` is true, every mask byte is
// set to 0xFF where the pixel is considered combed and to 0 elsewhere. The
// mask always uses one byte per pixel, whatever pixel_t is.
//
//   cthresh  combing threshold given for 8-bit data; it is scaled to the
//            clip's bit depth. A negative value marks every pixel as combed.
//   metric   0: compare a pixel against its vertical neighbours and confirm
//               with a 5-line filter (threshold 6 * cthresh);
//            otherwise: compare squared vertical differences against
//            cthresh squared.
//   cpuFlags selects SIMD helpers for the interior rows when available.
//
// When `chroma` is true the chroma mask planes are finally folded into the
// luma mask for the 4:2:0, 4:2:2, 4:4:4 and 4:1:1 layouts.
// NOTE(review): the border-row handling assumes planes of at least four rows
// (metric 0) - confirm that callers guarantee this.
template<typename pixel_t>
void checkCombedPlanarAnalyze_core(const VSVideoInfo *vi, int cthresh, bool chroma, const CPUFeatures *cpuFlags, int metric, const VSFrameRef *src, VSFrameRef* cmask, const VSAPI *vsapi)
{
  const int bits_per_pixel = vi->format->bitsPerSample;

  const bool use_sse2 = cpuFlags->sse2;
  const bool use_sse4 = cpuFlags->sse4_1;
  // cthresh: Area combing threshold used for combed frame detection.
  // This essentially controls how "strong" or "visible" combing must be to be detected.
  // Good values are from 6 to 12. If you know your source has a lot of combed frames set 
  // this towards the low end(6 - 7). If you know your source has very few combed frames set 
  // this higher(10 - 12). Going much lower than 5 to 6 or much higher than 12 is not recommended.

  // Scale by multiplication rather than by shifting cthresh itself: negative
  // thresholds are valid input and left-shifting a negative value is
  // undefined behaviour in C++17.
  const int scaled_cthresh = cthresh * (1 << (bits_per_pixel - 8));

  const int cthresh6 = scaled_cthresh * 6;

  const int np = vi->format->numPlanes;
  const int stop = chroma ? np : 1;

  for (int b = 0; b < stop; ++b)
  {
    const int plane = b;

    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(vsapi->getReadPtr(src, plane));
    const int src_pitch = vsapi->getStride(src, plane) / sizeof(pixel_t); // in pixels

    const int Width = vsapi->getFrameWidth(src, plane);
    const int Height = vsapi->getFrameHeight(src, plane);

    // Row cursors: two rows above, one above, current, one below, two below.
    // The "above" cursors are only dereferenced once they point into the plane.
    const pixel_t* srcpp = srcp - src_pitch;
    const pixel_t* srcppp = srcpp - src_pitch;
    const pixel_t* srcpn = srcp + src_pitch;
    const pixel_t* srcpnn = srcpn + src_pitch;

    uint8_t* cmkp = vsapi->getWritePtr(cmask, b);
    const int cmk_pitch = vsapi->getStride(cmask, b);

    if (scaled_cthresh < 0) {
      memset(cmkp, 255, Height * cmk_pitch); // mask. Always 8 bits 
      continue;
    }
    memset(cmkp, 0, Height * cmk_pitch);

    if (metric == 0)
    {
      // top #1: no rows above, so the row below stands in for the row above
      for (int x = 0; x < Width; ++x)
      {
        const int sFirst = srcp[x] - srcpn[x];
        if (sFirst > scaled_cthresh || sFirst < -scaled_cthresh)
        {
          if (abs(srcpnn[x] + (srcp[x] << 2) + srcpnn[x] - (3 * (srcpn[x] + srcpn[x]))) > cthresh6)
            cmkp[x] = 0xFF;
        }
      }
      srcppp += src_pitch;
      srcpp += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      srcpnn += src_pitch;
      cmkp += cmk_pitch;
      // top #2: the row two above does not exist, the row two below is used twice
      for (int x = 0; x < Width; ++x)
      {
        const int sFirst = srcp[x] - srcpp[x];
        const int sSecond = srcp[x] - srcpn[x];
        if ((sFirst > scaled_cthresh && sSecond > scaled_cthresh) || (sFirst < -scaled_cthresh && sSecond < -scaled_cthresh))
        {
          if (abs(srcpnn[x] + (srcp[x] << 2) + srcpnn[x] - (3 * (srcpp[x] + srcpn[x]))) > cthresh6)
            cmkp[x] = 0xFF;
        }
      }
      srcppp += src_pitch;
      srcpp += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      srcpnn += src_pitch;
      cmkp += cmk_pitch;
      // middle Height - 4
      const int lines_to_process = Height - 4;
      if (use_sse2 && sizeof(pixel_t) == 1)
        check_combing_SSE2((const uint8_t*)srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, scaled_cthresh);
      else if (use_sse4 && sizeof(pixel_t) == 2)
        check_combing_uint16_SSE4((const uint16_t*)srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, scaled_cthresh);
      else
        check_combing_c<pixel_t>(srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, scaled_cthresh);
      srcppp += src_pitch * lines_to_process;
      srcpp += src_pitch * lines_to_process;
      srcp += src_pitch * lines_to_process;
      srcpn += src_pitch * lines_to_process;
      srcpnn += src_pitch * lines_to_process;
      cmkp += cmk_pitch * lines_to_process;
      // bottom #-2: the row two below does not exist, the row two above is used twice
      for (int x = 0; x < Width; ++x)
      {
        const int sFirst = srcp[x] - srcpp[x];
        const int sSecond = srcp[x] - srcpn[x];
        if ((sFirst > scaled_cthresh && sSecond > scaled_cthresh) || (sFirst < -scaled_cthresh && sSecond < -scaled_cthresh))
        {
          if (abs(srcppp[x] + (srcp[x] << 2) + srcppp[x] - (3 * (srcpp[x] + srcpn[x]))) > cthresh6)
            cmkp[x] = 0xFF;
        }
      }
      srcppp += src_pitch;
      srcpp += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      srcpnn += src_pitch;
      cmkp += cmk_pitch;
      // bottom #-1: no rows below, so the row above stands in for the row below
      for (int x = 0; x < Width; ++x)
      {
        const int sFirst = srcp[x] - srcpp[x];
        if (sFirst > scaled_cthresh || sFirst < -scaled_cthresh)
        {
          if (abs(srcppp[x] + (srcp[x] << 2) + srcppp[x] - (3 * (srcpp[x] + srcpp[x]))) > cthresh6)
            cmkp[x] = 0xFF;
        }
      }
    }
    else
    {
      // metric == 1: squared
      // 16-bit differences squared do not fit into int, hence the wider type
      typedef typename std::conditional<sizeof(pixel_t) == 1, int, int64_t> ::type safeint_t;
      const safeint_t cthreshsq = (safeint_t)scaled_cthresh * scaled_cthresh;
      // top #1: only the row below is available
      for (int x = 0; x < Width; ++x)
      {
        if ((safeint_t)(srcp[x] - srcpn[x]) * (srcp[x] - srcpn[x]) > cthreshsq)
          cmkp[x] = 0xFF;
      }
      srcpp += src_pitch;
      srcp += src_pitch;
      srcpn += src_pitch;
      cmkp += cmk_pitch;
      // middle Height - 2
      const int lines_to_process = Height - 2;
      if (use_sse2)
      {
        if constexpr (sizeof(pixel_t) == 1)
          check_combing_SSE2_Metric1(srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, cthreshsq);
        else
          check_combing_c_Metric1<pixel_t, safeint_t>(srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, cthreshsq);
        // fixme: write SIMD? later. int64 inside.
        // check_combing_uint16_SSE2_Metric1(srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, cthreshsq);
      }
      else
      {
        check_combing_c_Metric1<pixel_t, safeint_t>(srcp, cmkp, Width, lines_to_process, src_pitch, cmk_pitch, cthreshsq);
      }
      srcpp += src_pitch * lines_to_process;
      srcp += src_pitch * lines_to_process;
      srcpn += src_pitch * lines_to_process;
      cmkp += cmk_pitch * lines_to_process;
      // Bottom: only the row above is available
      for (int x = 0; x < Width; ++x)
      {
        if ((safeint_t)(srcp[x] - srcpp[x]) * (srcp[x] - srcpp[x]) > cthreshsq)
          cmkp[x] = 0xFF;
      }
    }
  }

  // next block is for mask, no hbd needed
  // Includes chroma combing in the decision about whether a frame is combed.
  if (chroma)
  {
    if (vi->format->subSamplingW == 1 && vi->format->subSamplingH == 1) FillCombedPlanarUpdateCmaskByUV<420>(cmask, vsapi);
    else if (vi->format->subSamplingW == 1 && vi->format->subSamplingH == 0) FillCombedPlanarUpdateCmaskByUV<422>(cmask, vsapi);
    else if (vi->format->subSamplingW == 0 && vi->format->subSamplingH == 0) FillCombedPlanarUpdateCmaskByUV<444>(cmask, vsapi);
    else if (vi->format->subSamplingW == 2 && vi->format->subSamplingH == 0) FillCombedPlanarUpdateCmaskByUV<411>(cmask, vsapi);
  }
  // up to this point it is the same as in TFMPlanar::checkCombedPlanar
}

// explicit instantiations for 8-bit and 16-bit sample storage
template void checkCombedPlanarAnalyze_core<uint8_t>(const VSVideoInfo *vi, int cthresh, bool chroma, const CPUFeatures *cpuFlags, int metric, const VSFrameRef *src, VSFrameRef* cmask, const VSAPI *vsapi);
template void checkCombedPlanarAnalyze_core<uint16_t>(const VSVideoInfo *vi, int cthresh, bool chroma, const CPUFeatures *cpuFlags, int metric, const VSFrameRef *src, VSFrameRef* cmask, const VSAPI *vsapi);


// Decides whether `src` is combed for the given match.
//
// If mics[match] already holds a value other than -20, no analysis is run and
// the stored value is simply compared against MI. Otherwise the combing mask
// is rebuilt for the clip's sample size and evaluated block-wise by
// checkCombedPlanar_core, which also updates mics[match] and blockN[match].
// Returns true when the frame is considered combed.
bool TFM::checkCombedPlanar(const VSFrameRef *src, int n, int match,
  int *blockN, int &xblocksi, int *mics, bool ddebug, bool _chroma)
{
  // Recheck of an already evaluated match: nothing is processed.
  if (mics[match] != -20)
    return mics[match] > MI;

  const int bits_per_pixel = vi->format->bitsPerSample;
  const bool eightBitStorage = (vi->format->bytesPerSample == 1);

  if (!eightBitStorage)
  {
    checkCombedPlanarAnalyze_core<uint16_t>(vi, cthresh, _chroma, &cpuFlags, metric, src, cmask.get(), vsapi);
    return checkCombedPlanar_core<uint16_t>(src, n, match, blockN, xblocksi, mics, ddebug, bits_per_pixel);
  }

  checkCombedPlanarAnalyze_core<uint8_t>(vi, cthresh, _chroma, &cpuFlags, metric, src, cmask.get(), vsapi);
  return checkCombedPlanar_core<uint8_t>(src, n, match, blockN, xblocksi, mics, ddebug, bits_per_pixel);
}

// Evaluates the luma combing mask block-wise.
//
// A pixel counts as combed when its mask byte and the mask bytes directly
// above and below are all 0xFF. Each such pixel is credited to four
// overlapping blocks (block grid shifted by xhalf / yhalf), whose counters
// live in cArray as groups of four ints per block. The largest counter is
// stored in mics[match] together with its index in blockN[match], and
// xblocksi receives the number of counters per block row.
// Returns true when mics[match] exceeds MI.
template<typename pixel_t>
bool TFM::checkCombedPlanar_core(const VSFrameRef *src, int n, int match,
  int* blockN, int& xblocksi, int* mics, bool ddebug, int bits_per_pixel)
{
  // not needed by this implementation
  (void)src;
  (void)n;
  (void)ddebug;
  (void)bits_per_pixel;

  const bool use_sse2 = cpuFlags.sse2;

  // Row cursors over the mask: current row starts at row 1.
  const int maskStride = vsapi->getStride(cmask.get(), 0);
  const uint8_t *rowCur = vsapi->getWritePtr(cmask.get(), 0) + maskStride;
  const uint8_t *rowUp = rowCur - maskStride;
  const uint8_t *rowDown = rowCur + maskStride;
  const int Width = vsapi->getFrameWidth(cmask.get(), 0);
  const int Height = vsapi->getFrameHeight(cmask.get(), 0);

  const int xblocks = ((Width + xhalf) >> xshift) + 1;
  const int xblocks4 = xblocks << 2;
  xblocksi = xblocks4;
  const int yblocks = ((Height + yhalf) >> yshift) + 1;
  const int arraysize = (xblocks*yblocks) << 2;

  auto* const counts = cArray.get();
  memset(counts, 0, arraysize * sizeof(int));

  // True when the three vertically adjacent mask bytes at column x are set.
  const auto combedAt = [](const uint8_t *up, const uint8_t *cur, const uint8_t *down, int x)
  {
    return up[x] == 0xFF && cur[x] == 0xFF && down[x] == 0xFF;
  };

  // Credits `amount` to the four overlapping blocks that contain column x;
  // base1 / base2 are the counter offsets of the two block rows involved.
  const auto credit = [&](int base1, int base2, int x, int amount)
  {
    const int box1 = (x >> xshift) << 2;
    const int box2 = ((x + xhalf) >> xshift) << 2;
    counts[base1 + box1 + 0] += amount;
    counts[base1 + box2 + 1] += amount;
    counts[base2 + box1 + 2] += amount;
    counts[base2 + box2 + 3] += amount;
  };

  int Heighta = (Height >> (yshift - 1)) << (yshift - 1);
  if (Heighta == Height) Heighta = Height - yhalf;
  const int Widtha = (Width >> (xshift - 1)) << (xshift - 1);
  const bool use_sse2_sum = (use_sse2 && xhalf == 8 && yhalf == 8); // 8x8: no alignment

  // Leading rows, handled pixel by pixel.
  for (int y = 1; y < yhalf; ++y)
  {
    const int base1 = (y >> yshift)*xblocks4;
    const int base2 = ((y + yhalf) >> yshift)*xblocks4;
    for (int x = 0; x < Width; ++x)
    {
      if (combedAt(rowUp, rowCur, rowDown, x))
        credit(base1, base2, x, 1);
    }
    rowUp += maskStride;
    rowCur += maskStride;
    rowDown += maskStride;
  }

  // Main part, handled in tiles of xhalf columns by yhalf rows.
  for (int y = yhalf; y < Heighta; y += yhalf)
  {
    const int base1 = (y >> yshift)*xblocks4;
    const int base2 = ((y + yhalf) >> yshift)*xblocks4;

    for (int x = 0; x < Widtha; x += xhalf)
    {
      int sum = 0;
      if (use_sse2_sum)
      {
        compute_sum_8xN_sse2<8>(rowUp + x, maskStride, sum);
      }
      else
      {
        for (int u = 0; u < yhalf; ++u)
        {
          const int rowOffset = u * maskStride;
          for (int v = 0; v < xhalf; ++v)
          {
            if (combedAt(rowUp + rowOffset, rowCur + rowOffset, rowDown + rowOffset, x + v))
              ++sum;
          }
        }
      }
      if (sum)
        credit(base1, base2, x, sum);
    }

    // rest: columns right of the last full tile, one column at a time
    for (int x = Widtha; x < Width; ++x)
    {
      int sum = 0;
      for (int u = 0; u < yhalf; ++u)
      {
        const int rowOffset = u * maskStride;
        if (combedAt(rowUp + rowOffset, rowCur + rowOffset, rowDown + rowOffset, x))
          ++sum;
      }
      if (sum)
        credit(base1, base2, x, sum);
    }

    rowUp += maskStride*yhalf;
    rowCur += maskStride*yhalf;
    rowDown += maskStride*yhalf;
  }

  // Trailing rows, handled pixel by pixel again.
  for (int y = Heighta; y < Height - 1; ++y)
  {
    const int base1 = (y >> yshift)*xblocks4;
    const int base2 = ((y + yhalf) >> yshift)*xblocks4;
    for (int x = 0; x < Width; ++x)
    {
      if (combedAt(rowUp, rowCur, rowDown, x))
        credit(base1, base2, x, 1);
    }
    rowUp += maskStride;
    rowCur += maskStride;
    rowDown += maskStride;
  }

  // Keep the largest block counter and remember where it was found.
  for (int idx = 0; idx < arraysize; ++idx)
  {
    if (counts[idx] > mics[match])
    {
      mics[match] = counts[idx];
      blockN[match] = idx;
    }
  }

  return mics[match] > MI;
}

// Builds the difference map for one plane in two stages: an absolute-difference
// mask is produced from the previous/next inputs, then that mask is analysed
// into dstp by the AnalyzeDiffMask_Planar specialisation matching bits_per_pixel.
// Bit depths other than 8/10/12/14/16 leave dstp untouched.
template<typename pixel_t>
void TFM::buildDiffMapPlane_Planar(const uint8_t *prvp, const uint8_t *nxtp,
  uint8_t *dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int tpitch, int bits_per_pixel)
{
  // Stage 1: both source pointers are stepped back by one row of their own
  // pitch and only half of Height is requested.
  // NOTE(review): buildABSDiffMask takes no destination pointer; presumably it
  // fills the member tbuffer (pitch tpitch) that stage 2 reads - confirm.
  const int halfHeight = Height >> 1;
  buildABSDiffMask<pixel_t>(prvp - prv_pitch, nxtp - nxt_pitch, prv_pitch, nxt_pitch, tpitch, Width, halfHeight);

  // Stage 2: the bit depth is a template argument of the analyser, so it has
  // to be selected at run time from the supported set.
  if (bits_per_pixel == 8)
    AnalyzeDiffMask_Planar<uint8_t, 8>(dstp, dst_pitch, tbuffer.get(), tpitch, Width, Height);
  else if (bits_per_pixel == 10)
    AnalyzeDiffMask_Planar<uint16_t, 10>(dstp, dst_pitch, tbuffer.get(), tpitch, Width, Height);
  else if (bits_per_pixel == 12)
    AnalyzeDiffMask_Planar<uint16_t, 12>(dstp, dst_pitch, tbuffer.get(), tpitch, Width, Height);
  else if (bits_per_pixel == 14)
    AnalyzeDiffMask_Planar<uint16_t, 14>(dstp, dst_pitch, tbuffer.get(), tpitch, Width, Height);
  else if (bits_per_pixel == 16)
    AnalyzeDiffMask_Planar<uint16_t, 16>(dstp, dst_pitch, tbuffer.get(), tpitch, Width, Height);
}

// Explicit instantiations: the template body above lives in this translation
// unit, so the two pixel types are instantiated here (uint8_t and uint16_t)
// for use from other translation units.
template void TFM::buildDiffMapPlane_Planar<uint8_t>(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int tpitch, int bits_per_pixel);
template void TFM::buildDiffMapPlane_Planar<uint16_t>(const uint8_t* prvp, const uint8_t* nxtp,
  uint8_t* dstp, int prv_pitch, int nxt_pitch, int dst_pitch, int Height,
  int Width, int tpitch, int bits_per_pixel);


07070100000016000081A4000000000000000000000001671240C900000784000000000000000000000000000000000000002C00000000vapoursynth-tivtc-2+2.g7abd4a3/src/TFMasm.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
#ifndef __TFMASM_H__
#define __TFMASM_H__

#include <cstdint>
#include "internal.h"

// SSE2 scene-change helpers. Only the prototypes are declared in this header.
// NOTE(review): judging by the names and the uint64_t& out-parameters, each
// function presumably accumulates a difference metric between srcp and prvp
// into diffp (the _2 variants additionally between srcp and nxtp into diffn);
// confirm against the definitions.

// YUY2 variants.
void checkSceneChangeYUY2_1_SSE2(const uint8_t* prvp, const uint8_t* srcp,
  int height, int width, int prv_pitch, int src_pitch, uint64_t& diffp);
void checkSceneChangeYUY2_2_SSE2(const uint8_t* prvp, const uint8_t* srcp,
  const uint8_t* nxtp, int height, int width, int prv_pitch, int src_pitch,
  int nxt_pitch, uint64_t& diffp, uint64_t& diffn);

// Planar variants, with the same parameter lists as the YUY2 ones.
void checkSceneChangePlanar_1_SSE2(const uint8_t* prvp, const uint8_t* srcp,
  int height, int width, int prv_pitch, int src_pitch, uint64_t& diffp);
void checkSceneChangePlanar_2_SSE2(const uint8_t* prvp, const uint8_t* srcp,
  const uint8_t* nxtp, int height, int width, int prv_pitch, int src_pitch,
  int nxt_pitch, uint64_t& diffp, uint64_t& diffn);

#endif // __TFMASM_H__
07070100000017000081A4000000000000000000000001671240C9000014AC000000000000000000000000000000000000002F00000000vapoursynth-tivtc-2+2.g7abd4a3/src/calcCRC.cpp/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

//#include "internal.h"
#include "calcCRC.h"

// Byte-indexed lookup table for the table-driven CRC computed by calcCRC().
// The values are those of the common reflected CRC-32 table: entry 128 is the
// reflected polynomial 0xEDB88320 and entry 1 is 0x77073096.
// Four entries per row, so row r holds entries 4*r .. 4*r+3.
static const unsigned int Crc32Table[256] =
{
  0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
  0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
  0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
  0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
  0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
  0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
  0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
  0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
  0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
  0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
  0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
  0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
  0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
  0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
  0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
  0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
  0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
  0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
  0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
  0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
  0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
  0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
  0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
  0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
  0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
  0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
  0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
  0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
  0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
  0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
  0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
  0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
  0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
  0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
  0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
  0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
  0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
  0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
  0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
  0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
  0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
  0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
  0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
  0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
  0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
  0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
  0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
  0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
  0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
  0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
  0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
  0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
  0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
  0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
  0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
  0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
  0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
  0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
  0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
  0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
  0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
  0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
  0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
  0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D,
};

// Computes a table-driven CRC over the first plane of the leading frames of a
// clip.
//
//   hclip  clip whose frames are hashed
//   stop   number of frames to hash, counted from frame 0; clamped to the
//          clip's numFrames
//   crc    receives the result; it starts at 0xFFFFFFFF and is returned
//          WITHOUT a final inversion
//   vsapi  VapourSynth API table used to fetch and release the frames
//
// Only the visible bytes of each row are hashed (width * bytesPerSample); the
// stride padding after them is skipped.
//
// If a frame cannot be fetched (getFrame returns a null pointer) hashing stops
// at that frame instead of dereferencing the null frame; crc then covers only
// the frames read so far.
void calcCRC(VSNodeRef *hclip, int stop, unsigned int &crc, const VSAPI *vsapi)
{
  crc = 0xFFFFFFFF;
  const VSVideoInfo *vi2 = vsapi->getVideoInfo(hclip);
  if (stop > vi2->numFrames) stop = vi2->numFrames;
  for (int n = 0; n < stop; ++n)
  {
    const VSFrameRef *src = vsapi->getFrame(n, hclip, nullptr, 0);
    if (!src)
      break; // frame request failed: nothing to read and nothing to free
    const uint8_t *row = vsapi->getReadPtr(src, 0);
    const int rowBytes = vsapi->getFrameWidth(src, 0) * vsapi->getFrameFormat(src)->bytesPerSample;
    const int pitch = vsapi->getStride(src, 0);
    const int height = vsapi->getFrameHeight(src, 0);
    for (int y = 0; y < height; ++y)
    {
      for (int i = 0; i < rowBytes; ++i)
        crc = Crc32Table[(crc ^ row[i]) & 0xFF] ^ (crc >> 8);
      row += pitch; // advance by the full stride, skipping the row padding
    }
    // The final inversion stays disabled, as in the original implementation.
    //crc = crc ^ ~0U;
    vsapi->freeFrame(src);
  }
}
07070100000018000081A4000000000000000000000001671240C9000004F1000000000000000000000000000000000000002D00000000vapoursynth-tivtc-2+2.g7abd4a3/src/calcCRC.h/*
**                    TIVTC for AviSynth 2.6 interface
**
**   TIVTC includes a field matching filter (TFM) and a decimation
**   filter (TDecimate) which can be used together to achieve an
**   IVTC or for other uses. TIVTC currently supports 8 bit planar YUV and
**   YUY2 colorspaces.
**
**   Copyright (C) 2004-2008 Kevin Stone, additional work (C) 2020 pinterf
**
**   This program is free software; you can redistribute it and/or modify
**   it under the terms of the GNU General Public License as published by
**   the Free Software Foundation; either version 2 of the License, or
**   (at your option) any later version.
**
**   This program is distributed in the hope that it will be useful,
**   but WITHOUT ANY WARRANTY; without even the implied warranty of
**   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**   GNU General Public License for more details.
**
**   You should have received a copy of the GNU General Public License
**   along with this program; if not, write to the Free Software
**   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

//#include <windows.h>
//#include "internal.h"
#include <VapourSynth.h>

// Computes a table-driven CRC over the bytes of plane 0 of frames
// [0, min(stop, number of frames in hclip)) and stores it in crc. The value
// starts at 0xFFFFFFFF and is returned without a final inversion. vsapi is the
// VapourSynth API table used to fetch and release the frames.
void calcCRC(VSNodeRef *hclip, int stop, unsigned int& crc, const VSAPI *vsapi);
07070100000019000081A4000000000000000000000001671240C900000D7C000000000000000000000000000000000000003300000000vapoursynth-tivtc-2+2.g7abd4a3/src/cpufeatures.cpp/*
* Copyright (c) 2012-2019 Fredrik Mellbin
*
* This file is part of VapourSynth.
*
* VapourSynth is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* VapourSynth is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with VapourSynth; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <string.h>

#include "cpufeatures.h"

#ifdef VS_TARGET_CPU_X86

#ifdef _MSC_VER
#include <intrin.h>
#else
#include <cpuid.h>
#endif

// Executes CPUID for leaf `index` with sub-leaf 0 and stores the four result
// registers through the output pointers. The outputs are zeroed first.
static void vs_cpu_cpuid(int index, int* eax, int* ebx, int* ecx, int* edx) {
    *eax = *ebx = *ecx = *edx = 0;
#if defined(_MSC_VER)
    // MSVC intrinsic: results arrive as {eax, ebx, ecx, edx}.
    int out[4];
    __cpuidex(out, index, 0);
    *eax = out[0];
    *ebx = out[1];
    *ecx = out[2];
    *edx = out[3];
#elif defined(__GNUC__)
    // GCC/Clang helper macro from <cpuid.h>; writes the registers in place.
    __cpuid_count(index, 0, *eax, *ebx, *ecx, *edx);
#else
#error "Unknown compiler, can't get cpuid"
#endif
}

// Reads the extended control register selected by `ecx` with the XGETBV
// instruction and returns it as one 64-bit value (EDX in the high half, EAX
// in the low half). On compilers that are neither MSVC nor GNU-compatible the
// function returns 0.
// The caller must make sure XGETBV is usable before calling.
static unsigned long long vs_cpu_xgetbv(unsigned ecx) {
#if defined(_MSC_VER)
    return _xgetbv(ecx);
#elif defined(__GNUC__)
    unsigned eax, edx;
    // Outputs: EAX -> eax, EDX -> edx; input: register index in ECX.
    __asm("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx) : );
    return (((unsigned long long)edx) << 32) | eax;
#else
    return 0;
#endif
}

// Fills *cpuFeatures from CPUID (and XGETBV where needed). Every flag starts
// out cleared; the AVX-family flags are only set when both the CPU and the
// OS-enabled register state allow them.
static void doGetCPUFeatures(CPUFeatures *cpuFeatures) {
    memset(cpuFeatures, 0, sizeof(CPUFeatures));

    // True when bit `pos` of the register value `reg` is set.
    const auto bitSet = [](int reg, int pos) -> bool {
        return ((static_cast<unsigned>(reg) >> pos) & 1u) != 0;
    };

    // Leaf 1: baseline feature bits in EDX/ECX.
    int a1 = 0;
    int b1 = 0;
    int c1 = 0;
    int d1 = 0;
    vs_cpu_cpuid(1, &a1, &b1, &c1, &d1);
    cpuFeatures->sse2 = bitSet(d1, 26);
    cpuFeatures->sse3 = bitSet(c1, 0);
    cpuFeatures->ssse3 = bitSet(c1, 9);
    cpuFeatures->sse4_1 = bitSet(c1, 19);
    cpuFeatures->sse4_2 = bitSet(c1, 20);
    cpuFeatures->fma3 = bitSet(c1, 12);
    cpuFeatures->f16c = bitSet(c1, 29);
    cpuFeatures->aes = bitSet(c1, 25);
    cpuFeatures->movbe = bitSet(c1, 22);
    cpuFeatures->popcnt = bitSet(c1, 23);

    // ECX bits 27 and 28 must both be set before XGETBV is executed and AVX
    // is considered at all.
    if (!(bitSet(c1, 27) && bitSet(c1, 28)))
        return;

    // The control register must have bits 1 and 2 set (mask 0x06) for AVX.
    const long long xcr0 = vs_cpu_xgetbv(0);
    cpuFeatures->avx = ((xcr0 & 0x06) == 0x06);
    if (!cpuFeatures->avx)
        return;

    // Leaf 7: AVX2 and the AVX-512 family live in EBX.
    int a7 = 0;
    int b7 = 0;
    int c7 = 0;
    int d7 = 0;
    vs_cpu_cpuid(7, &a7, &b7, &c7, &d7);
    cpuFeatures->avx2 = bitSet(b7, 5);
    // AVX-512 additionally needs bits 5-7 (mask 0xE0) of the control register.
    cpuFeatures->avx512_f = bitSet(b7, 16) && ((xcr0 & 0xE0) == 0xE0);
    if (!cpuFeatures->avx512_f)
        return;

    cpuFeatures->avx512_cd = bitSet(b7, 28);
    cpuFeatures->avx512_bw = bitSet(b7, 30);
    cpuFeatures->avx512_dq = bitSet(b7, 17);
    cpuFeatures->avx512_vl = bitSet(b7, 31);
}
#else
// Build without VS_TARGET_CPU_X86: no detection is performed, the record is
// simply cleared.
static void doGetCPUFeatures(CPUFeatures *cpuFeatures) {
    memset(cpuFeatures, 0, sizeof *cpuFeatures);
}
#endif

// Returns the CPU feature record. Detection runs once, when the function-local
// static is initialised on the first call; later calls return the same object.
const CPUFeatures *getCPUFeatures(void) {
    static CPUFeatures detected = [] {
        CPUFeatures probe;
        doGetCPUFeatures(&probe);
        return probe;
    }();
    return &detected;
}
0707010000001A000081A4000000000000000000000001671240C900000539000000000000000000000000000000000000003100000000vapoursynth-tivtc-2+2.g7abd4a3/src/cpufeatures.h/*
* Copyright (c) 2012-2017 Fredrik Mellbin
*
* This file is part of VapourSynth.
*
* VapourSynth is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* VapourSynth is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with VapourSynth; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifndef CPUFEATURES_H
#define CPUFEATURES_H

#ifdef __cplusplus
extern "C" {
#endif

/* Record of CPU capability flags. Each member is a char used as a boolean
 * (0 = not available, non-zero = available). The members exist only when
 * VS_TARGET_CPU_X86 is defined; otherwise the struct has no members. */
typedef struct CPUFeatures {
#ifdef VS_TARGET_CPU_X86
    /* SSE family */
    char sse2;
    char sse3;
    char ssse3;
    char sse4_1;
    char sse4_2;
    /* FMA / AVX family */
    char fma3;
    char avx;
    char avx2;
    /* Miscellaneous instruction-set extensions */
    char f16c;
    char aes;
    char movbe;
    char popcnt;
    /* AVX-512 subsets */
    char avx512_f;
    char avx512_cd;
    char avx512_bw;
    char avx512_dq;
    char avx512_vl;
#endif
} CPUFeatures;

/* Returns a pointer to the CPU feature record. The caller must not modify or
 * free it. */
const CPUFeatures *getCPUFeatures(void);

#ifdef __cplusplus
}
#endif

#endif
0707010000001B000081A4000000000000000000000001671240C900000C15000000000000000000000000000000000000002E00000000vapoursynth-tivtc-2+2.g7abd4a3/src/internal.h#ifndef __Internal_H__
#define __Internal_H__

#include <stdexcept>
#include <cstring>
#include <cstdio>

#ifdef _WIN32
#include <windows.h>
#endif

// these settings control whether the included code comes from old asm or newer simd/C rewrites
#define USE_C_NO_ASM
// USE_C_NO_ASM: inline non-simd asm
// NOTE(review): nothing in this header tests USE_C_NO_ASM; presumably other
// translation units check it to choose the C/SIMD rewrite over the legacy
// inline assembly - confirm before removing or undefining it.


// Portable "always inline" specifier: __forceinline on Windows builds,
// the GNU always_inline attribute (plus `inline`) everywhere else.
// NOTE(review): the switch is on _WIN32 rather than on the compiler, so a
// non-MSVC Windows toolchain must also provide __forceinline - confirm.
#ifdef _WIN32
#define AVS_FORCEINLINE __forceinline
#else
#define AVS_FORCEINLINE __attribute__((always_inline)) inline
#endif


// Names of the per-frame properties used by the plugin. Keeping them as macros
// gives every user of a property the same spelling. The names that start with
// an underscore (_Combed, _DurationNum, _DurationDen) match VapourSynth's
// reserved frame property names.

// Frame properties set by TFM:
#define PROP_TFMDisplay "TFMDisplay"
#define PROP_TFMMATCH "TFMMatch"
#define PROP_TFMMics "TFMMics"
#define PROP_Combed "_Combed"
#define PROP_TFMD2VFilm "TFMD2VFilm"
#define PROP_TFMField "TFMField"
#define PROP_TFMPP "TFMPP"

// Frame properties set by TDecimate:
#define PROP_TDecimateDisplay "TDecimateDisplay"
#define PROP_TDecimateCycleStart "TDecimateCycleStart"
#define PROP_TDecimateCycleMaxBlockDiff "TDecimateCycleMaxBlockDiff" // uint64_t[]
#define PROP_TDecimateOriginalFrame "TDecimateOriginalFrame"
#define PROP_DurationNum "_DurationNum"
#define PROP_DurationDen "_DurationDen"

// Exception type for TIVTC failures. It adds nothing to std::runtime_error
// except a distinct type to catch; all base constructors are inherited.
class TIVTCError : public std::runtime_error {
public:
    using std::runtime_error::runtime_error;
};


// Match identifiers. The trailing note on each line is the letter MTC() yields
// for that value.
constexpr int ISP = 0;  // p
constexpr int ISC = 1;  // c
constexpr int ISN = 2;  // n
constexpr int ISB = 3;  // b
constexpr int ISU = 4;  // u
constexpr int ISDB = 5; // l = (deinterlaced c bottom field)
constexpr int ISDT = 6; // h = (deinterlaced c top field)

// Maps a match identifier (0..6) to its one-letter mnemonic:
// 0 'p', 1 'c', 2 'n', 3 'b', 4 'u', 5 'l', 6 'h'; any other value gives 'x'.
// The argument and the whole expansion are parenthesized so the macro works
// with compound arguments (e.g. MTC(a | b)) and inside larger expressions
// (e.g. MTC(m) == 'c'). The argument is evaluated several times, so it must
// not have side effects.
#define MTC(n) ((n) == 0 ? 'p' : (n) == 1 ? 'c' : (n) == 2 ? 'n' : (n) == 3 ? 'b' : (n) == 4 ? 'u' : \
               (n) == 5 ? 'l' : (n) == 6 ? 'h' : 'x')

// Single-bit flags.
constexpr int TOP_FIELD = 1 << 3;
constexpr int COMBED = 1 << 4;
constexpr int D2VFILM = 1 << 5;

// Values and masks carrying the FILE_ / D2VARRAY_ prefixes.
constexpr int FILE_COMBED = 0x30;
constexpr int FILE_NOTCOMBED = 0x20;
constexpr int FILE_ENTRY = 0x80;
constexpr int FILE_D2V = 0x08;
constexpr int D2VARRAY_DUP_MASK = 0x03;
constexpr int D2VARRAY_MATCH_MASK = 0x3C;

// Layout of one "ovr array" entry (bits counted from 1).
constexpr int DROP_FRAME = 1 << 0; // ovr array - bit 1
constexpr int KEEP_FRAME = 1 << 1; // ovr array - 2
constexpr int FILM = 1 << 2;       // ovr array - bit 3
constexpr int VIDEO = 1 << 3;      // ovr array - bit 4
constexpr int ISMATCH = 0x70;      // ovr array - bits 5-7
constexpr int ISD2VFILM = 1 << 7;  // ovr array - bit 8

// Maps a rate index (1..5) to its frame rate as a string literal; any other
// value gives "unknown". The argument and the whole expansion are
// parenthesized so compound arguments (e.g. cfps(a & b)) and use inside larger
// expressions behave as expected. The argument is evaluated several times, so
// it must not have side effects.
#define cfps(n) ((n) == 1 ? "119.880120" : (n) == 2 ? "59.940060" : (n) == 3 ? "39.960040" : \
                (n) == 4 ? "29.970030" : (n) == 5 ? "23.976024" : "unknown")


// Plugin version string. Any VERSION macro defined earlier (for example on the
// compiler command line or by another header) is discarded first so that this
// definition does not trigger a redefinition diagnostic.
#ifdef VERSION
#undef VERSION
#endif
#define VERSION "v1.0.7"


// fopen wrapper that treats `name` as UTF-8 on Windows.
// Elsewhere it forwards to std::fopen unchanged. On Windows the name is
// converted to UTF-16 and opened with _wfopen; a TIVTCError is thrown when the
// conversion does not produce the expected number of characters.
// Returns whatever the underlying open call returns.
static FILE *tivtc_fopen(const char *name, const char *mode) {
#ifndef _WIN32
    return std::fopen(name, mode);
#else
    // First call: query the required length (includes the terminator because
    // the input length is given as -1).
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, name, -1, nullptr, 0);
    std::wstring wideName(wideLen, 0);

    // Second call: perform the conversion into the sized buffer.
    if (MultiByteToWideChar(CP_UTF8, 0, name, -1, wideName.data(), wideLen) != wideLen)
        throw TIVTCError("Failed to convert file name to wide char.");

    // The mode string is widened character by character.
    const std::wstring wideMode(mode, mode + strlen(mode));
    return _wfopen(wideName.c_str(), wideMode.c_str());
#endif
}


#endif  // __Internal_H__
07070100000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000B00000000TRAILER!!!1234 blocks
Places

File vapoursynth-tivtc-2+2.g7abd4a3.obscpio of Package vapoursynth-plugin-tivtc

Places