Logo openSUSE Build Service > Projects
Sign Up | Log In

View File ffmpeg_NVIDIA_gpu_acceleration.patch of Package ffmpeg-3.0 (Project home:linux4humans:rebuilds_of_some_packman_apps)

diff --git a/configure b/configure
index d5e76de..2b3e61c 100755
--- a/configure
+++ b/configure
@@ -277,6 +277,7 @@
   --enable-mmal            enable decoding via MMAL [no]
   --enable-netcdf          enable NetCDF, needed for sofalizer filter [no]
   --enable-nvenc           enable NVIDIA NVENC support [no]
+  --enable-nvresize        enable NVIDIA CUDA accelerated resizer [no]
   --enable-openal          enable OpenAL 1.1 capture support [no]
   --enable-opencl          enable OpenCL code
   --enable-opengl          enable OpenGL rendering [no]
@@ -1517,6 +1518,7 @@
     mmal
     netcdf
     nvenc
+    nvresize
     openal
     opencl
     opengl
@@ -5214,7 +5216,8 @@
 frei0r_filter_extralibs='$ldl'
 frei0r_src_filter_extralibs='$ldl'
 ladspa_filter_extralibs='$ldl'
-nvenc_encoder_extralibs='$ldl'
+nvenc_encoder_extralibs='$ldl $lcudautils'
+nvresize_filter_extralibs='$ldl $lcudautils'
 
 if ! disabled network; then
     check_func getaddrinfo $network_extralibs
@@ -5567,9 +5570,11 @@
 
 enabled netcdf            && require_pkg_config netcdf netcdf.h nc_inq_libvers
 enabled nvenc             && { check_header nvEncodeAPI.h || die "ERROR: nvEncodeAPI.h not found."; } &&
+                             { check_lib 'cudautils.h' init_cuda -lcudautils -ldl || die "ERROR: libcudautils not found."; } &&
                              { check_cpp_condition nvEncodeAPI.h "NVENCAPI_MAJOR_VERSION >= 5" ||
                                die "ERROR: NVENC API version 4 or older is not supported"; } &&
                              { [ $target_os != cygwin ] || die "ERROR: NVENC is not supported on Cygwin currently."; }
+enabled nvresize          && { check_lib 'cudautils.h' init_cuda -lcudautils -ldl || die "ERROR: libcudautils not found."; } # must not end in '&&': that would chain the openal check below onto nvresize
 enabled openal            && { { for al_libs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
                                check_lib 'AL/al.h' alGetError "${al_libs}" && break; done } ||
                                die "ERROR: openal not found"; } &&
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 68a573f..3a6a857 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -98,7 +98,7 @@ OBJS-$(CONFIG_MPEGVIDEOENC)            += mpegvideo_enc.o mpeg12data.o  \
                                           motion_est.o ratecontrol.o    \
                                           mpegvideoencdsp.o
 OBJS-$(CONFIG_MSS34DSP)                += mss34dsp.o
-OBJS-$(CONFIG_NVENC)                   += nvenc.o
+OBJS-$(CONFIG_NVENC)                   += nvenc.o nvenc_ptx.o
 OBJS-$(CONFIG_PIXBLOCKDSP)             += pixblockdsp.o
 OBJS-$(CONFIG_QPELDSP)                 += qpeldsp.o
 OBJS-$(CONFIG_QSV)                     += qsv.o
diff --git a/libavcodec/nvenc.c b/libavcodec/nvenc.c
index 43b8e78..3a606f7 100644
--- a/libavcodec/nvenc.c
+++ b/libavcodec/nvenc.c
@@ -25,8 +25,11 @@
 #include <dlfcn.h>
 #endif
 
+/* External headers */
 #include <nvEncodeAPI.h>
+#include <cudautils.h>
 
+/* FFmpeg headers */
 #include "libavutil/internal.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avassert.h"
@@ -36,11 +39,6 @@
 #include "internal.h"
 #include "thread.h"
 
-#if defined(_WIN32)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
 
 #if defined(_WIN32)
 #define LOAD_FUNC(l, s) GetProcAddress(l, s)
@@ -50,28 +48,19 @@
 #define DL_CLOSE_FUNC(l) dlclose(l)
 #endif
 
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-typedef int CUdevice;
-typedef void* CUcontext;
-
-typedef CUresult(CUDAAPI *PCUINIT)(unsigned int Flags);
-typedef CUresult(CUDAAPI *PCUDEVICEGETCOUNT)(int *count);
-typedef CUresult(CUDAAPI *PCUDEVICEGET)(CUdevice *device, int ordinal);
-typedef CUresult(CUDAAPI *PCUDEVICEGETNAME)(char *name, int len, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUDEVICECOMPUTECAPABILITY)(int *major, int *minor, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXCREATE)(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult(CUDAAPI *PCUCTXPOPCURRENT)(CUcontext *pctx);
-typedef CUresult(CUDAAPI *PCUCTXDESTROY)(CUcontext ctx);
+#define BLOCKSX  128
+#define THREADSX 128
 
 typedef NVENCSTATUS (NVENCAPI* PNVENCODEAPICREATEINSTANCE)(NV_ENCODE_API_FUNCTION_LIST *functionList);
 
 typedef struct NvencInputSurface
 {
     NV_ENC_INPUT_PTR input_surface;
+    CUdeviceptr      dptr;
+    void*            hRes;
     int width;
     int height;
+    size_t pitch;
 
     int lockCount;
 
@@ -107,24 +96,11 @@ typedef struct NvencDataList
 
 typedef struct NvencDynLoadFunctions
 {
-    PCUINIT cu_init;
-    PCUDEVICEGETCOUNT cu_device_get_count;
-    PCUDEVICEGET cu_device_get;
-    PCUDEVICEGETNAME cu_device_get_name;
-    PCUDEVICECOMPUTECAPABILITY cu_device_compute_capability;
-    PCUCTXCREATE cu_ctx_create;
-    PCUCTXPOPCURRENT cu_ctx_pop_current;
-    PCUCTXDESTROY cu_ctx_destroy;
-
     NV_ENCODE_API_FUNCTION_LIST nvenc_funcs;
-    int nvenc_device_count;
-    CUdevice nvenc_devices[16];
 
 #if defined(_WIN32)
-    HMODULE cuda_lib;
     HMODULE nvenc_lib;
 #else
-    void* cuda_lib;
     void* nvenc_lib;
 #endif
 } NvencDynLoadFunctions;
@@ -140,14 +116,18 @@ typedef struct NvencContext
     AVClass *avclass;
 
     NvencDynLoadFunctions nvenc_dload_funcs;
+    CudaDynLoadFunctions* cuda_dload_funcs;
 
     NV_ENC_INITIALIZE_PARAMS init_encode_params;
     NV_ENC_CONFIG encode_config;
     CUcontext cu_context;
+    CUmodule  cu_module;
+    CUfunction cu_func_interleaveChroma;
 
     int max_surface_count;
     NvencInputSurface *input_surfaces;
     NvencOutputSurface *output_surfaces;
+    NvencInputSurface transferSurf;
 
     NvencDataList output_surface_queue;
     NvencDataList output_surface_ready_queue;
@@ -164,8 +144,10 @@ typedef struct NvencContext
     int twopass;
     int gpu;
     int buffer_delay;
+    int aq;
 } NvencContext;
 
+
 static const NvencValuePair nvenc_h264_level_pairs[] = {
     { "auto", NV_ENC_LEVEL_AUTOSELECT },
     { "1"   , NV_ENC_LEVEL_H264_1     },
@@ -330,79 +312,18 @@ static int64_t timestamp_queue_dequeue(NvencDataList* queue)
     return res->u.timestamp;
 }
 
-#define CHECK_LOAD_FUNC(t, f, s) \
-do { \
-    (f) = (t)LOAD_FUNC(dl_fn->cuda_lib, s); \
-    if (!(f)) { \
-        av_log(avctx, AV_LOG_FATAL, "Failed loading %s from CUDA library\n", s); \
-        goto error; \
-    } \
-} while (0)
-
-static av_cold int nvenc_dyload_cuda(AVCodecContext *avctx)
-{
-    NvencContext *ctx = avctx->priv_data;
-    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
-
-    if (dl_fn->cuda_lib)
-        return 1;
-
-#if defined(_WIN32)
-    dl_fn->cuda_lib = LoadLibrary(TEXT("nvcuda.dll"));
-#else
-    dl_fn->cuda_lib = dlopen("libcuda.so", RTLD_LAZY);
-#endif
-
-    if (!dl_fn->cuda_lib) {
-        av_log(avctx, AV_LOG_FATAL, "Failed loading CUDA library\n");
-        goto error;
-    }
-
-    CHECK_LOAD_FUNC(PCUINIT, dl_fn->cu_init, "cuInit");
-    CHECK_LOAD_FUNC(PCUDEVICEGETCOUNT, dl_fn->cu_device_get_count, "cuDeviceGetCount");
-    CHECK_LOAD_FUNC(PCUDEVICEGET, dl_fn->cu_device_get, "cuDeviceGet");
-    CHECK_LOAD_FUNC(PCUDEVICEGETNAME, dl_fn->cu_device_get_name, "cuDeviceGetName");
-    CHECK_LOAD_FUNC(PCUDEVICECOMPUTECAPABILITY, dl_fn->cu_device_compute_capability, "cuDeviceComputeCapability");
-    CHECK_LOAD_FUNC(PCUCTXCREATE, dl_fn->cu_ctx_create, "cuCtxCreate_v2");
-    CHECK_LOAD_FUNC(PCUCTXPOPCURRENT, dl_fn->cu_ctx_pop_current, "cuCtxPopCurrent_v2");
-    CHECK_LOAD_FUNC(PCUCTXDESTROY, dl_fn->cu_ctx_destroy, "cuCtxDestroy_v2");
-
-    return 1;
-
-error:
-
-    if (dl_fn->cuda_lib)
-        DL_CLOSE_FUNC(dl_fn->cuda_lib);
-
-    dl_fn->cuda_lib = NULL;
-
-    return 0;
-}
-
-static av_cold int check_cuda_errors(AVCodecContext *avctx, CUresult err, const char *func)
-{
-    if (err != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, ">> %s - failed with error code 0x%x\n", func, err);
-        return 0;
-    }
-    return 1;
-}
-#define check_cuda_errors(f) if (!check_cuda_errors(avctx, f, #f)) goto error
-
 static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
 {
-    int device_count = 0;
-    CUdevice cu_device = 0;
-    char gpu_name[128];
-    int smminor = 0, smmajor = 0;
-    int i, smver, target_smver;
 
+    int  target_smver;
     NvencContext *ctx = avctx->priv_data;
-    NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
+
+    if (!init_cuda())
+        return 0;
 
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
-        target_smver = avctx->pix_fmt == AV_PIX_FMT_YUV444P ? 0x52 : 0x30;
+        target_smver = 0x30;
         break;
     case AV_CODEC_ID_H265:
         target_smver = 0x52;
@@ -412,49 +333,19 @@ static av_cold int nvenc_check_cuda(AVCodecContext *avctx)
         goto error;
     }
 
-    if (!nvenc_dyload_cuda(avctx))
-        return 0;
-
-    if (dl_fn->nvenc_device_count > 0)
-        return 1;
-
-    check_cuda_errors(dl_fn->cu_init(0));
-
-    check_cuda_errors(dl_fn->cu_device_get_count(&device_count));
-
-    if (!device_count) {
-        av_log(avctx, AV_LOG_FATAL, "No CUDA capable devices found\n");
+    if (!is_gpu_feature_available(ctx->gpu, target_smver))
+    {
+        av_log(avctx, AV_LOG_FATAL, "NVENC with Codec %s Not Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264)? "H264" : "H265", ctx->gpu);
         goto error;
     }
-
-    av_log(avctx, AV_LOG_VERBOSE, "%d CUDA capable devices found\n", device_count);
-
-    dl_fn->nvenc_device_count = 0;
-
-    for (i = 0; i < device_count; ++i) {
-        check_cuda_errors(dl_fn->cu_device_get(&cu_device, i));
-        check_cuda_errors(dl_fn->cu_device_get_name(gpu_name, sizeof(gpu_name), cu_device));
-        check_cuda_errors(dl_fn->cu_device_compute_capability(&smmajor, &smminor, cu_device));
-
-        smver = (smmajor << 4) | smminor;
-
-        av_log(avctx, AV_LOG_VERBOSE, "[ GPU #%d - < %s > has Compute SM %d.%d, NVENC %s ]\n", i, gpu_name, smmajor, smminor, (smver >= target_smver) ? "Available" : "Not Available");
-
-        if (smver >= target_smver)
-            dl_fn->nvenc_devices[dl_fn->nvenc_device_count++] = cu_device;
-    }
-
-    if (!dl_fn->nvenc_device_count) {
-        av_log(avctx, AV_LOG_FATAL, "No NVENC capable devices found\n");
-        goto error;
+    else
+    {
+        av_log(avctx, AV_LOG_VERBOSE, "NVENC with Codec %s Available at requested GPU %d \n", (avctx->codec->id == AV_CODEC_ID_H264) ? "H264" : "H265", ctx->gpu);
     }
 
     return 1;
 
 error:
-
-    dl_fn->nvenc_device_count = 0;
-
     return 0;
 }
 
@@ -488,23 +379,18 @@ static av_cold int nvenc_dyload_nvenc(AVCodecContext *avctx)
     }
 
     nvEncodeAPICreateInstance = (PNVENCODEAPICREATEINSTANCE)LOAD_FUNC(dl_fn->nvenc_lib, "NvEncodeAPICreateInstance");
-
     if (!nvEncodeAPICreateInstance) {
         av_log(avctx, AV_LOG_FATAL, "Failed to load nvenc entrypoint\n");
         goto error;
     }
 
     dl_fn->nvenc_funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER;
-
     nvstatus = nvEncodeAPICreateInstance(&dl_fn->nvenc_funcs);
-
     if (nvstatus != NV_ENC_SUCCESS) {
         av_log(avctx, AV_LOG_FATAL, "Failed to create nvenc instance\n");
         goto error;
     }
 
-    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
-
     return 1;
 
 error:
@@ -512,7 +398,6 @@ error:
         DL_CLOSE_FUNC(dl_fn->nvenc_lib);
 
     dl_fn->nvenc_lib = NULL;
-
     return 0;
 }
 
@@ -523,29 +408,16 @@ static av_cold void nvenc_unload_nvenc(AVCodecContext *avctx)
 
     DL_CLOSE_FUNC(dl_fn->nvenc_lib);
     dl_fn->nvenc_lib = NULL;
-
-    dl_fn->nvenc_device_count = 0;
-
-    DL_CLOSE_FUNC(dl_fn->cuda_lib);
-    dl_fn->cuda_lib = NULL;
-
-    dl_fn->cu_init = NULL;
-    dl_fn->cu_device_get_count = NULL;
-    dl_fn->cu_device_get = NULL;
-    dl_fn->cu_device_get_name = NULL;
-    dl_fn->cu_device_compute_capability = NULL;
-    dl_fn->cu_ctx_create = NULL;
-    dl_fn->cu_ctx_pop_current = NULL;
-    dl_fn->cu_ctx_destroy = NULL;
-
+    deinit_cuda();
     av_log(avctx, AV_LOG_VERBOSE, "Nvenc unloaded\n");
 }
 
 static av_cold int nvenc_encode_init(AVCodecContext *avctx)
 {
     NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS encode_session_params = { 0 };
+    NV_ENC_REGISTER_RESOURCE registerParams = { 0 };
     NV_ENC_PRESET_CONFIG preset_config = { 0 };
-    CUcontext cu_context_curr;
+    CudaDynLoadFunctions *p_cuda;
     CUresult cu_res;
     GUID encoder_preset = NV_ENC_PRESET_HQ_GUID;
     GUID codec;
@@ -557,6 +429,7 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     int res = 0;
     int dw, dh;
     int qp_inter_p;
+    extern char color_ptx[];
 
     NvencContext *ctx = avctx->priv_data;
     NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
@@ -574,28 +447,18 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     encode_session_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
     encode_session_params.apiVersion = NVENCAPI_VERSION;
 
-    if (ctx->gpu >= dl_fn->nvenc_device_count) {
-        av_log(avctx, AV_LOG_FATAL, "Requested GPU %d, but only %d GPUs are available!\n", ctx->gpu, dl_fn->nvenc_device_count);
-        res = AVERROR(EINVAL);
-        goto error;
-    }
-
-    ctx->cu_context = NULL;
-    cu_res = dl_fn->cu_ctx_create(&ctx->cu_context, 4, dl_fn->nvenc_devices[ctx->gpu]); // CU_CTX_SCHED_BLOCKING_SYNC=4, avoid CPU spins
+    cu_res = get_cuda_context(&ctx->cu_context, ctx->gpu);
+    p_cuda = get_cuda_dl_func();
 
     if (cu_res != CUDA_SUCCESS) {
         av_log(avctx, AV_LOG_FATAL, "Failed creating CUDA context for NVENC: 0x%x\n", (int)cu_res);
         res = AVERROR_EXTERNAL;
         goto error;
     }
+    av_log(avctx, AV_LOG_VERBOSE, "NVENC : Cuda Context created %p\n", (void *)ctx->cu_context); /* %p: an (int) cast truncates the handle on 64-bit targets */
 
-    cu_res = dl_fn->cu_ctx_pop_current(&cu_context_curr);
-
-    if (cu_res != CUDA_SUCCESS) {
-        av_log(avctx, AV_LOG_FATAL, "Failed popping CUDA context: 0x%x\n", (int)cu_res);
-        res = AVERROR_EXTERNAL;
-        goto error;
-    }
+    __cu(p_cuda->cu_module_load_data(&ctx->cu_module, color_ptx));
+    __cu(p_cuda->cu_module_get_function(&ctx->cu_func_interleaveChroma, ctx->cu_module, "interleaveChroma"));
 
     encode_session_params.device = ctx->cu_context;
     encode_session_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
@@ -766,11 +629,8 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     if (ctx->encode_config.frameIntervalP >= 2)
         ctx->last_dts = -2;
 
-    if (avctx->bit_rate > 0) {
+    if (avctx->bit_rate > 0)
         ctx->encode_config.rcParams.averageBitRate = avctx->bit_rate;
-    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
-        ctx->encode_config.rcParams.maxBitRate = ctx->encode_config.rcParams.averageBitRate;
-    }
 
     if (avctx->rc_max_rate > 0)
         ctx->encode_config.rcParams.maxBitRate = avctx->rc_max_rate;
@@ -853,11 +713,8 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         }
     }
 
-    if (avctx->rc_buffer_size > 0) {
+    if (avctx->rc_buffer_size > 0)
         ctx->encode_config.rcParams.vbvBufferSize = avctx->rc_buffer_size;
-    } else if (ctx->encode_config.rcParams.averageBitRate > 0) {
-        ctx->encode_config.rcParams.vbvBufferSize = 2 * ctx->encode_config.rcParams.averageBitRate;
-    }
 
     if (avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT) {
         ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FIELD;
@@ -865,6 +722,15 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         ctx->encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
     }
 
+    if (ctx->aq)
+    {
+        ctx->encode_config.rcParams.enableAQ = 1;
+    }
+    else
+    {
+        ctx->encode_config.rcParams.enableAQ = 0;
+    }
+
     switch (avctx->codec->id) {
     case AV_CODEC_ID_H264:
         ctx->encode_config.encodeCodecConfig.h264Config.h264VUIParameters.colourDescriptionPresentFlag = 1;
@@ -881,9 +747,6 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
 
         if (!ctx->profile) {
             switch (avctx->profile) {
-            case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
-                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-                break;
             case FF_PROFILE_H264_BASELINE:
                 ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
                 break;
@@ -909,9 +772,6 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             } else if (!strcmp(ctx->profile, "baseline")) {
                 ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
                 avctx->profile = FF_PROFILE_H264_BASELINE;
-            } else if (!strcmp(ctx->profile, "high444p")) {
-                ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-                avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
             } else {
                 av_log(avctx, AV_LOG_FATAL, "Profile \"%s\" is unknown! Supported profiles: high, main, baseline\n", ctx->profile);
                 res = AVERROR(EINVAL);
@@ -919,13 +779,7 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             }
         }
 
-        // force setting profile as high444p if input is AV_PIX_FMT_YUV444P
-        if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
-            ctx->encode_config.profileGUID = NV_ENC_H264_PROFILE_HIGH_444_GUID;
-            avctx->profile = FF_PROFILE_H264_HIGH_444_PREDICTIVE;
-        }
-
-        ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = avctx->profile == FF_PROFILE_H264_HIGH_444_PREDICTIVE ? 3 : 1;
+        ctx->encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
 
         if (ctx->level) {
             res = input_string_to_uint32(avctx, nvenc_h264_level_pairs, ctx->level, &ctx->encode_config.encodeCodecConfig.h264Config.level);
@@ -981,6 +835,8 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         goto error;
     }
 
+    av_log(avctx, AV_LOG_VERBOSE, "Nvenc initialized successfully\n");
+
     ctx->input_surfaces = av_malloc(ctx->max_surface_count * sizeof(*ctx->input_surfaces));
 
     if (!ctx->input_surfaces) {
@@ -995,28 +851,32 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
         goto error;
     }
 
+    // Allocation for temp surface used for sys mem -> device mem transfer
+    if (avctx->pix_fmt == AV_PIX_FMT_YUV420P)
+    {
+		ctx->transferSurf.width = (avctx->width + 31) & ~31;
+        ctx->transferSurf.height = (avctx->height + 31) & ~31;
+        p_cuda->cu_mem_alloc_pitch(&ctx->transferSurf.dptr,
+                &ctx->transferSurf.pitch,
+                ctx->transferSurf.width,
+                ctx->transferSurf.height/ 2, 16);
+    }
+
     for (surfaceCount = 0; surfaceCount < ctx->max_surface_count; ++surfaceCount) {
-        NV_ENC_CREATE_INPUT_BUFFER allocSurf = { 0 };
         NV_ENC_CREATE_BITSTREAM_BUFFER allocOut = { 0 };
-        allocSurf.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
         allocOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
 
-        allocSurf.width = (avctx->width + 31) & ~31;
-        allocSurf.height = (avctx->height + 31) & ~31;
-
-        allocSurf.memoryHeap = NV_ENC_MEMORY_HEAP_SYSMEM_CACHED;
+        ctx->input_surfaces[surfaceCount].width = (avctx->width + 31) & ~31;
+        ctx->input_surfaces[surfaceCount].height = (avctx->height + 31) & ~31;
 
         switch (avctx->pix_fmt) {
         case AV_PIX_FMT_YUV420P:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YV12_PL;
-            break;
-
         case AV_PIX_FMT_NV12:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
-            break;
-
-        case AV_PIX_FMT_YUV444P:
-            allocSurf.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL;
+            ctx->input_surfaces[surfaceCount].format = NV_ENC_BUFFER_FORMAT_NV12_PL;
+            p_cuda->cu_mem_alloc_pitch(&ctx->input_surfaces[surfaceCount].dptr,
+                    &ctx->input_surfaces[surfaceCount].pitch,
+                    ctx->input_surfaces[surfaceCount].width,
+                    ctx->input_surfaces[surfaceCount].height * 3 / 2, 16);
             break;
 
         default:
@@ -1025,18 +885,21 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
             goto error;
         }
 
-        nv_status = p_nvenc->nvEncCreateInputBuffer(ctx->nvencoder, &allocSurf);
+        registerParams.version = NV_ENC_REGISTER_RESOURCE_VER; /* one statement per field; the original relied on comma operators */
+        registerParams.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
+        registerParams.width = ctx->input_surfaces[surfaceCount].width;
+        registerParams.height = ctx->input_surfaces[surfaceCount].height;
+        registerParams.pitch = ctx->input_surfaces[surfaceCount].pitch;
+        registerParams.bufferFormat = ctx->input_surfaces[surfaceCount].format;
+        registerParams.resourceToRegister = (void*)ctx->input_surfaces[surfaceCount].dptr;
+        nv_status = p_nvenc->nvEncRegisterResource(ctx->nvencoder, &registerParams);
         if (nv_status != NV_ENC_SUCCESS) {
-            av_log(avctx, AV_LOG_FATAL, "CreateInputBuffer failed\n");
+            av_log(avctx, AV_LOG_FATAL, "RegisterResource failed\n");
             res = AVERROR_EXTERNAL;
             goto error;
         }
-
+        ctx->input_surfaces[surfaceCount].hRes = registerParams.registeredResource;
         ctx->input_surfaces[surfaceCount].lockCount = 0;
-        ctx->input_surfaces[surfaceCount].input_surface = allocSurf.inputBuffer;
-        ctx->input_surfaces[surfaceCount].format = allocSurf.bufferFmt;
-        ctx->input_surfaces[surfaceCount].width = allocSurf.width;
-        ctx->input_surfaces[surfaceCount].height = allocSurf.height;
 
         /* 1MB is large enough to hold most output frames. NVENC increases this automaticaly if it's not enough. */
         allocOut.size = 1024 * 1024;
@@ -1092,21 +955,22 @@ static av_cold int nvenc_encode_init(AVCodecContext *avctx)
     return 0;
 
 error:
-
     for (i = 0; i < surfaceCount; ++i) {
-        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+        p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
+
         if (ctx->output_surfaces[i].output_surface)
             p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
     }
 
+	p_cuda->cu_mem_free(ctx->transferSurf.dptr);
     if (ctx->nvencoder)
         p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
 
     if (ctx->cu_context)
-        dl_fn->cu_ctx_destroy(ctx->cu_context);
+        release_cuda_context(&ctx->cu_context, ctx->gpu);
 
     nvenc_unload_nvenc(avctx);
-
     ctx->nvencoder = NULL;
     ctx->cu_context = NULL;
 
@@ -1118,6 +982,7 @@ static av_cold int nvenc_encode_close(AVCodecContext *avctx)
     NvencContext *ctx = avctx->priv_data;
     NvencDynLoadFunctions *dl_fn = &ctx->nvenc_dload_funcs;
     NV_ENCODE_API_FUNCTION_LIST *p_nvenc = &dl_fn->nvenc_funcs;
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
     int i;
 
     av_freep(&ctx->timestamp_list.data);
@@ -1125,16 +990,19 @@ static av_cold int nvenc_encode_close(AVCodecContext *avctx)
     av_freep(&ctx->output_surface_queue.data);
 
     for (i = 0; i < ctx->max_surface_count; ++i) {
-        p_nvenc->nvEncDestroyInputBuffer(ctx->nvencoder, ctx->input_surfaces[i].input_surface);
+        p_nvenc->nvEncUnregisterResource(ctx->nvencoder, ctx->input_surfaces[i].hRes);
+        p_cuda->cu_mem_free(ctx->input_surfaces[i].dptr);
         p_nvenc->nvEncDestroyBitstreamBuffer(ctx->nvencoder, ctx->output_surfaces[i].output_surface);
     }
     ctx->max_surface_count = 0;
 
-    p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    if (ctx->nvencoder)
+        p_nvenc->nvEncDestroyEncoder(ctx->nvencoder);
+    if (ctx->transferSurf.dptr) p_cuda->cu_mem_free(ctx->transferSurf.dptr); /* staging buffer allocated for YUV420P in init; previously only freed on init's error path */
     ctx->nvencoder = NULL;
 
-    dl_fn->cu_ctx_destroy(ctx->cu_context);
-    ctx->cu_context = NULL;
+    if (ctx->cu_context)
+        release_cuda_context(&ctx->cu_context, ctx->gpu);
 
     nvenc_unload_nvenc(avctx);
 
@@ -1149,6 +1017,7 @@ static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencOut
 
     uint32_t slice_mode_data;
     uint32_t *slice_offsets;
+    char picType = 'X';
     NV_ENC_LOCK_BITSTREAM lock_params = { 0 };
     NVENCSTATUS nv_status;
     int res = 0;
@@ -1201,12 +1070,15 @@ static int process_output_surface(AVCodecContext *avctx, AVPacket *pkt, NvencOut
 FF_DISABLE_DEPRECATION_WARNINGS
     case NV_ENC_PIC_TYPE_I:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
+        picType = 'I';
         break;
     case NV_ENC_PIC_TYPE_P:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_P;
+        picType = 'P';
         break;
     case NV_ENC_PIC_TYPE_B:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_B;
+        picType = 'B';
         break;
     case NV_ENC_PIC_TYPE_BI:
         avctx->coded_frame->pict_type = AV_PICTURE_TYPE_BI;
@@ -1220,6 +1092,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
 #endif
     }
 
+    av_log(avctx, AV_LOG_VERBOSE, "FRAME STATISTICS: Frame No. %d  PicType %c Frame AvgQP %d  SATD Cost %d  Size %d bytes\r", lock_params.frameIdx, picType, lock_params.frameAvgQP, lock_params.frameSatd, lock_params.bitstreamSizeInBytes);
+
     pkt->pts = lock_params.outputTimeStamp;
     pkt->dts = timestamp_queue_dequeue(&ctx->timestamp_list);
 
@@ -1247,6 +1121,61 @@ error:
     return res;
 }
 
+/* Launches kernel `func` via dl_func->cu_launch_kernel, passing BLOCKSX,1,1 / THREADSX,1,1 and the seven arguments packed in args_uchar (presumably Cb + Cr planes -> interleaved NV12 chroma; argument order assumed to mirror cuLaunchKernel - confirm in cudautils.h). */
+static int call_interleavechroma_kernel(CudaDynLoadFunctions* dl_func, CUfunction func,
+    CUdeviceptr cb_dptr, CUdeviceptr cr_dptr, CUdeviceptr nv12chroma_dptr, int width, int height, int srcStride, int dstStride)
+{
+    void *args_uchar[] = { &cb_dptr, &cr_dptr, &nv12chroma_dptr, &width, &height, &srcStride, &dstStride};
+    __cu(dl_func->cu_launch_kernel(func, BLOCKSX, 1, 1, THREADSX, 1, 1, 0, NULL, args_uchar, NULL));
+    /* NOTE(review): __cu is defined outside this file; if it returns early on a CUDA error, the line below is reached only on success - confirm. */
+    return 0;
+}
+/* Copies one frame (NV12 or YUV420P, host- or device-resident) into inSurf; returns 0 on success, AVERROR(EINVAL) for any other format, or the chroma-kernel helper's status. */
+static int nvenc_copy_to_inputbuffer(NvencContext *ctx, const AVFrame* frame, NvencInputSurface *inSurf)
+{
+    CudaDynLoadFunctions *p_cuda = get_cuda_dl_func();
+    if (frame->format == AV_PIX_FMT_NV12) {
+        /* NV12: both planes are copied as-is; no kernel launch in this branch. */
+        // check opaque field, if there's already a deviceptr
+        if (frame->opaque && check_nvinfo(frame->opaque) &&
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+            ffnvinfo* info = (ffnvinfo*)frame->opaque;
+            /* device -> device copies (both memory types are CU_MEMORYTYPE_DEVICE) */
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(NULL, info->dptr[1], info->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else
+        {
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, inSurf->dptr + inSurf->pitch*inSurf->height, inSurf->pitch, frame->width, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        }
+    }
+    else if (frame->format == AV_PIX_FMT_YUV420P) {
+        // check opaque field, if there's already a deviceptr
+        if (frame->opaque && check_nvinfo(frame->opaque) &&
+            ((ffnvinfo*)(frame->opaque))->dptr[0]) {
+            ffnvinfo* info = (ffnvinfo*)frame->opaque;
+            /* luma is copied device -> device; chroma goes through the interleave kernel, whose status is returned to the caller */
+            __cu(cuMemCpy2d(NULL, info->dptr[0], info->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+            return call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, info->dptr[1], info->dptr[2], inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width+31) & ~31 , frame->height, (info->linesize[1]<<1), inSurf->pitch);
+        }
+        else
+        {
+            __cu(cuMemCpy2d(frame->data[0], (CUdeviceptr)NULL, frame->linesize[0], NULL, inSurf->dptr, inSurf->pitch, frame->width, frame->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[1], (CUdeviceptr)NULL, frame->linesize[1], NULL, ctx->transferSurf.dptr, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            __cu(cuMemCpy2d(frame->data[2], (CUdeviceptr)NULL, frame->linesize[2], NULL, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height / 4, ctx->transferSurf.pitch / 2, ctx->transferSurf.width / 2, frame->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+            /* Return the helper's status rather than dropping it, so the caller's != 0 check can see a failed launch. */
+            return call_interleavechroma_kernel(p_cuda, ctx->cu_func_interleaveChroma, ctx->transferSurf.dptr, ctx->transferSurf.dptr + ctx->transferSurf.pitch*ctx->transferSurf.height/4, inSurf->dptr + inSurf->pitch*inSurf->height, (frame->width + 31) & ~31, frame->height, ctx->transferSurf.pitch, inSurf->pitch);
+        }
+    }
+    else {
+        av_log(NULL, AV_LOG_FATAL, "Invalid pixel format!\n");
+        return AVERROR(EINVAL);
+    }
+
+    return 0;
+}
+
 static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     const AVFrame *frame, int *got_packet)
 {
@@ -1262,7 +1191,7 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
     pic_params.version = NV_ENC_PIC_PARAMS_VER;
 
     if (frame) {
-        NV_ENC_LOCK_INPUT_BUFFER lockBufferParams = { 0 };
+        NV_ENC_MAP_INPUT_RESOURCE mapParams = { 0 };
         NvencInputSurface *inSurf = NULL;
 
         for (i = 0; i < ctx->max_surface_count; ++i) {
@@ -1276,69 +1205,27 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 
         inSurf->lockCount = 1;
 
-        lockBufferParams.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
-        lockBufferParams.inputBuffer = inSurf->input_surface;
-
-        nv_status = p_nvenc->nvEncLockInputBuffer(ctx->nvencoder, &lockBufferParams);
+        mapParams.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
+        mapParams.registeredResource = inSurf->hRes;
+        nv_status = p_nvenc->nvEncMapInputResource(ctx->nvencoder, &mapParams);
         if (nv_status != NV_ENC_SUCCESS) {
-            av_log(avctx, AV_LOG_ERROR, "Failed locking nvenc input buffer\n");
+            av_log(avctx, AV_LOG_ERROR, "Failed mapping nvenc input buffer\n");
             return 0;
         }
 
-        if (avctx->pix_fmt == AV_PIX_FMT_YUV420P) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
-                frame->data[2], frame->linesize[2],
-                avctx->width >> 1, avctx->height >> 1);
-
-            buf += (inSurf->height * lockBufferParams.pitch) >> 2;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch >> 1,
-                frame->data[1], frame->linesize[1],
-                avctx->width >> 1, avctx->height >> 1);
-        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[1], frame->linesize[1],
-                avctx->width, avctx->height >> 1);
-        } else if (avctx->pix_fmt == AV_PIX_FMT_YUV444P) {
-            uint8_t *buf = lockBufferParams.bufferDataPtr;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[0], frame->linesize[0],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
-
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[1], frame->linesize[1],
-                avctx->width, avctx->height);
-
-            buf += inSurf->height * lockBufferParams.pitch;
+        inSurf->input_surface = mapParams.mappedResource;
+        if (inSurf->format != mapParams.mappedBufferFmt) {
+            av_log(avctx, AV_LOG_ERROR, "Incompatible buffer format!\n");
+            return 0;
+        }
 
-            av_image_copy_plane(buf, lockBufferParams.pitch,
-                frame->data[2], frame->linesize[2],
-                avctx->width, avctx->height);
-        } else {
-            av_log(avctx, AV_LOG_FATAL, "Invalid pixel format!\n");
-            return AVERROR(EINVAL);
+        if (nvenc_copy_to_inputbuffer(ctx, frame, inSurf) != 0) {
+            p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
+            av_log(avctx, AV_LOG_ERROR, "Failed to copy data to NVENC input buffer!\n");
+            return 0;
         }
 
-        nv_status = p_nvenc->nvEncUnlockInputBuffer(ctx->nvencoder, inSurf->input_surface);
+        nv_status = p_nvenc->nvEncUnmapInputResource(ctx->nvencoder, inSurf->input_surface);
         if (nv_status != NV_ENC_SUCCESS) {
             av_log(avctx, AV_LOG_FATAL, "Failed unlocking input buffer!\n");
             return AVERROR_EXTERNAL;
@@ -1456,33 +1343,31 @@ static int nvenc_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 static const enum AVPixelFormat pix_fmts_nvenc[] = {
     AV_PIX_FMT_YUV420P,
     AV_PIX_FMT_NV12,
-    AV_PIX_FMT_YUV444P,
     AV_PIX_FMT_NONE
 };
 
 #define OFFSET(x) offsetof(NvencContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
-    { "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "medium" }, 0, 0, VE },
-    { "profile", "Set the encoding profile (high, main, baseline or high444p)", OFFSET(profile), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
-    { "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { .str = "auto" }, 0, 0, VE },
-    { "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { .str = "main" }, 0, 0, VE },
+    { "preset", "Set the encoding preset (one of slow = hq 2pass, medium = hq, fast = hp, hq, hp, bd, ll, llhq, llhp, default)", OFFSET(preset), AV_OPT_TYPE_STRING, { .str = "hq" }, 0, 0, VE },
+    { "profile", "Set the encoding profile (high, main, baseline)", OFFSET(profile), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "level", "Set the encoding level restriction (auto, 1.0, 1.0b, 1.1, 1.2, ..., 4.2, 5.0, 5.1)", OFFSET(level), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
+    { "tier", "Set the encoding tier (main or high)", OFFSET(tier), AV_OPT_TYPE_STRING, { 0 }, 0, 0, VE },
     { "cbr", "Use cbr encoding mode", OFFSET(cbr), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { "2pass", "Use 2pass encoding mode", OFFSET(twopass), AV_OPT_TYPE_BOOL, { .i64 = -1 }, -1, 1, VE },
     { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE },
     { "delay", "Delays frame output by the given amount of frames.", OFFSET(buffer_delay), AV_OPT_TYPE_INT, { .i64 = INT_MAX }, 0, INT_MAX, VE },
+    { "enableaq", "set to 1 to enable AQ ", OFFSET(aq), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VE },
     { NULL }
 };
 
 static const AVCodecDefault nvenc_defaults[] = {
-    { "b", "2M" },
+    { "b", "0" },
     { "qmin", "-1" },
     { "qmax", "-1" },
     { "qdiff", "-1" },
     { "qblur", "-1" },
     { "qcomp", "-1" },
-    { "g", "250" },
-    { "bf", "0" },
     { NULL },
 };
 
diff --git a/libavcodec/nvenc_ptx.c b/libavcodec/nvenc_ptx.c
new file mode 100644
index 0000000..ad40a7f
--- /dev/null
+++ b/libavcodec/nvenc_ptx.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#if _WIN32 || _WIN64 /* Windows toolchains: _WIN64 is only defined for 64-bit targets */
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// GCC-compatible compilers: choose the PTX variant that matches the target pointer width
+#if __GNUC__
+#if __x86_64__ || __ppc64__ || __aarch64__ || __LP64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32
+const char color_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19830389\n"
+	"// Cuda compilation tools, release 8.0, V8.0.0\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.3\n"
+	".target sm_30\n"
+	".address_size 32\n"
+	"\n"
+	"// .globl	interleaveChroma\n"
+	"\n"
+	".visible .entry interleaveChroma(\n"
+	".param .u32 interleaveChroma_param_0,\n"
+	".param .u32 interleaveChroma_param_1,\n"
+	".param .u32 interleaveChroma_param_2,\n"
+	".param .u32 interleaveChroma_param_3,\n"
+	".param .u32 interleaveChroma_param_4,\n"
+	".param .u32 interleaveChroma_param_5,\n"
+	".param .u32 interleaveChroma_param_6\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<5>;\n"
+	".reg .b32 	%r<57>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r15, [interleaveChroma_param_0];\n"
+	"ld.param.u32 	%r16, [interleaveChroma_param_1];\n"
+	"ld.param.u32 	%r17, [interleaveChroma_param_2];\n"
+	"ld.param.u32 	%r18, [interleaveChroma_param_3];\n"
+	"ld.param.u32 	%r21, [interleaveChroma_param_4];\n"
+	"ld.param.u32 	%r19, [interleaveChroma_param_5];\n"
+	"ld.param.u32 	%r20, [interleaveChroma_param_6];\n"
+	"shr.s32 	%r1, %r21, 1;\n"
+	"mov.u32 	%r55, %ctaid.x;\n"
+	"setp.ge.s32	%p1, %r55, %r1;\n"
+	"@%p1 bra 	BB0_6;\n"
+	"\n"
+	"cvta.to.global.u32 	%r3, %r17;\n"
+	"cvta.to.global.u32 	%r4, %r16;\n"
+	"cvta.to.global.u32 	%r5, %r15;\n"
+	"mov.u32 	%r6, %tid.x;\n"
+	"shr.s32 	%r7, %r18, 3;\n"
+	"mov.u32 	%r8, %ntid.x;\n"
+	"\n"
+	"BB0_2:\n"
+	"setp.ge.s32	%p2, %r6, %r7;\n"
+	"@%p2 bra 	BB0_5;\n"
+	"\n"
+	"mul.lo.s32 	%r22, %r55, %r19;\n"
+	"mul.lo.s32 	%r23, %r55, %r20;\n"
+	"shr.s32 	%r10, %r23, 2;\n"
+	"shr.s32 	%r24, %r22, 2;\n"
+	"shr.u32 	%r11, %r24, 1;\n"
+	"mov.u32 	%r56, %r6;\n"
+	"\n"
+	"BB0_4:\n"
+	"mov.u32 	%r12, %r56;\n"
+	"add.s32 	%r25, %r12, %r11;\n"
+	"shl.b32 	%r26, %r25, 2;\n"
+	"add.s32 	%r27, %r5, %r26;\n"
+	"add.s32 	%r28, %r4, %r26;\n"
+	"ld.global.u32 	%r29, [%r28];\n"
+	"and.b32  	%r30, %r29, 65280;\n"
+	"shl.b32 	%r31, %r30, 16;\n"
+	"ld.global.u32 	%r32, [%r27];\n"
+	"shl.b32 	%r33, %r32, 8;\n"
+	"and.b32  	%r34, %r33, 16711680;\n"
+	"shl.b32 	%r35, %r29, 8;\n"
+	"and.b32  	%r36, %r35, 65280;\n"
+	"and.b32  	%r37, %r32, 255;\n"
+	"or.b32  	%r38, %r34, %r37;\n"
+	"or.b32  	%r39, %r38, %r31;\n"
+	"or.b32  	%r40, %r39, %r36;\n"
+	"shl.b32 	%r41, %r12, 1;\n"
+	"add.s32 	%r42, %r41, %r10;\n"
+	"shl.b32 	%r43, %r42, 2;\n"
+	"add.s32 	%r44, %r3, %r43;\n"
+	"st.global.u32 	[%r44], %r40;\n"
+	"and.b32  	%r45, %r29, -16777216;\n"
+	"and.b32  	%r46, %r32, -16777216;\n"
+	"shr.u32 	%r47, %r46, 8;\n"
+	"or.b32  	%r48, %r45, %r47;\n"
+	"and.b32  	%r49, %r29, 16711680;\n"
+	"shr.u32 	%r50, %r49, 8;\n"
+	"bfe.u32 	%r51, %r32, 16, 8;\n"
+	"or.b32  	%r52, %r48, %r51;\n"
+	"or.b32  	%r53, %r52, %r50;\n"
+	"st.global.u32 	[%r44+4], %r53;\n"
+	"add.s32 	%r13, %r8, %r12;\n"
+	"setp.lt.s32	%p3, %r13, %r7;\n"
+	"mov.u32 	%r56, %r13;\n"
+	"@%p3 bra 	BB0_4;\n"
+	"\n"
+	"BB0_5:\n"
+	"mov.u32 	%r54, %nctaid.x;\n"
+	"add.s32 	%r55, %r54, %r55;\n"
+	"setp.lt.s32	%p4, %r55, %r1;\n"
+	"@%p4 bra 	BB0_2;\n"
+	"\n"
+	"BB0_6:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#elif defined ENVIRONMENT64
+const char color_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19830389\n"
+	"// Cuda compilation tools, release 8.0, V8.0.0\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.3\n"
+	".target sm_30\n"
+	".address_size 64\n"
+	"\n"
+	"// .globl	interleaveChroma\n"
+	"\n"
+	".visible .entry interleaveChroma(\n"
+	".param .u64 interleaveChroma_param_0,\n"
+	".param .u64 interleaveChroma_param_1,\n"
+	".param .u64 interleaveChroma_param_2,\n"
+	".param .u32 interleaveChroma_param_3,\n"
+	".param .u32 interleaveChroma_param_4,\n"
+	".param .u32 interleaveChroma_param_5,\n"
+	".param .u32 interleaveChroma_param_6\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<5>;\n"
+	".reg .b32 	%r<47>;\n"
+	".reg .b64 	%rd<14>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd4, [interleaveChroma_param_0];\n"
+	"ld.param.u64 	%rd5, [interleaveChroma_param_1];\n"
+	"ld.param.u64 	%rd6, [interleaveChroma_param_2];\n"
+	"ld.param.u32 	%r12, [interleaveChroma_param_3];\n"
+	"ld.param.u32 	%r15, [interleaveChroma_param_4];\n"
+	"ld.param.u32 	%r13, [interleaveChroma_param_5];\n"
+	"ld.param.u32 	%r14, [interleaveChroma_param_6];\n"
+	"shr.s32 	%r1, %r15, 1;\n"
+	"mov.u32 	%r45, %ctaid.x;\n"
+	"setp.ge.s32	%p1, %r45, %r1;\n"
+	"@%p1 bra 	BB0_6;\n"
+	"\n"
+	"cvta.to.global.u64 	%rd1, %rd6;\n"
+	"cvta.to.global.u64 	%rd2, %rd5;\n"
+	"cvta.to.global.u64 	%rd3, %rd4;\n"
+	"mov.u32 	%r3, %tid.x;\n"
+	"shr.s32 	%r4, %r12, 3;\n"
+	"mov.u32 	%r5, %ntid.x;\n"
+	"\n"
+	"BB0_2:\n"
+	"setp.ge.s32	%p2, %r3, %r4;\n"
+	"@%p2 bra 	BB0_5;\n"
+	"\n"
+	"mul.lo.s32 	%r16, %r45, %r13;\n"
+	"mul.lo.s32 	%r17, %r45, %r14;\n"
+	"shr.s32 	%r7, %r17, 2;\n"
+	"shr.s32 	%r18, %r16, 2;\n"
+	"shr.u32 	%r8, %r18, 1;\n"
+	"mov.u32 	%r46, %r3;\n"
+	"\n"
+	"BB0_4:\n"
+	"mov.u32 	%r9, %r46;\n"
+	"add.s32 	%r19, %r9, %r8;\n"
+	"mul.wide.u32 	%rd7, %r19, 4;\n"
+	"add.s64 	%rd8, %rd3, %rd7;\n"
+	"add.s64 	%rd9, %rd2, %rd7;\n"
+	"ld.global.u32 	%r20, [%rd9];\n"
+	"and.b32  	%r21, %r20, 65280;\n"
+	"shl.b32 	%r22, %r21, 16;\n"
+	"ld.global.u32 	%r23, [%rd8];\n"
+	"shl.b32 	%r24, %r23, 8;\n"
+	"and.b32  	%r25, %r24, 16711680;\n"
+	"shl.b32 	%r26, %r20, 8;\n"
+	"and.b32  	%r27, %r26, 65280;\n"
+	"and.b32  	%r28, %r23, 255;\n"
+	"or.b32  	%r29, %r25, %r28;\n"
+	"or.b32  	%r30, %r29, %r22;\n"
+	"or.b32  	%r31, %r30, %r27;\n"
+	"shl.b32 	%r32, %r9, 1;\n"
+	"add.s32 	%r33, %r32, %r7;\n"
+	"mul.wide.u32 	%rd10, %r33, 4;\n"
+	"add.s64 	%rd11, %rd1, %rd10;\n"
+	"st.global.u32 	[%rd11], %r31;\n"
+	"and.b32  	%r34, %r20, -16777216;\n"
+	"and.b32  	%r35, %r23, -16777216;\n"
+	"shr.u32 	%r36, %r35, 8;\n"
+	"or.b32  	%r37, %r34, %r36;\n"
+	"and.b32  	%r38, %r20, 16711680;\n"
+	"shr.u32 	%r39, %r38, 8;\n"
+	"bfe.u32 	%r40, %r23, 16, 8;\n"
+	"or.b32  	%r41, %r37, %r40;\n"
+	"or.b32  	%r42, %r41, %r39;\n"
+	"add.s32 	%r43, %r33, 1;\n"
+	"mul.wide.u32 	%rd12, %r43, 4;\n"
+	"add.s64 	%rd13, %rd1, %rd12;\n"
+	"st.global.u32 	[%rd13], %r42;\n"
+	"add.s32 	%r10, %r5, %r9;\n"
+	"setp.lt.s32	%p3, %r10, %r4;\n"
+	"mov.u32 	%r46, %r10;\n"
+	"@%p3 bra 	BB0_4;\n"
+	"\n"
+	"BB0_5:\n"
+	"mov.u32 	%r44, %nctaid.x;\n"
+	"add.s32 	%r45, %r44, %r45;\n"
+	"setp.lt.s32	%p4, %r45, %r1;\n"
+	"@%p4 bra 	BB0_2;\n"
+	"\n"
+	"BB0_6:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#endif
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 1f4abeb..30240e4 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -180,6 +180,7 @@ OBJS-$(CONFIG_NEGATE_FILTER)                 += vf_lut.o
 OBJS-$(CONFIG_NOFORMAT_FILTER)               += vf_format.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += vf_noise.o
 OBJS-$(CONFIG_NULL_FILTER)                   += vf_null.o
+OBJS-$(CONFIG_NVRESIZE_FILTER)               += vf_nvresize.o vf_nvresize_ptx.o
 OBJS-$(CONFIG_OCR_FILTER)                    += vf_ocr.o
 OBJS-$(CONFIG_OCV_FILTER)                    += vf_libopencv.o
 OBJS-$(CONFIG_OPENCL)                        += deshake_opencl.o unsharp_opencl.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 63b8fdb..f02b791 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -272,6 +272,7 @@ void avfilter_register_all(void)
     REGISTER_FILTER(ZMQ,            zmq,            vf);
     REGISTER_FILTER(ZOOMPAN,        zoompan,        vf);
     REGISTER_FILTER(ZSCALE,         zscale,         vf);
+    REGISTER_FILTER(NVRESIZE,       nvresize,       vf);
 
     REGISTER_FILTER(ALLRGB,         allrgb,         vsrc);
     REGISTER_FILTER(ALLYUV,         allyuv,         vsrc);
diff --git a/libavfilter/vf_nvresize.c b/libavfilter/vf_nvresize.c
new file mode 100644
index 0000000..c93ce7d
--- /dev/null
+++ b/libavfilter/vf_nvresize.c
@@ -0,0 +1,671 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* External headers */
+#include <cudautils.h>
+
+/* FFmpeg headers */
+#include "libavutil/avassert.h"
+#include "libavutil/avstring.h"
+#include "libavutil/eval.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/parseutils.h"
+
+#include "avfilter.h"
+#include "drawutils.h"
+#include "formats.h"
+#include "internal.h"
+#include "video.h"
+
+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
+#define MAX_OUTPUT 16
+#define BLOCKX 32
+#define BLOCKY 16
+
+typedef struct cu_tex { /* description of one image buffer referenced through a CUdeviceptr */
+    int w;              /* width in pixels */
+    int h;              /* height in pixels (luma rows for the planar YUV layouts) */
+    size_t pitch;       /* row pitch as filled in by cu_mem_alloc_pitch() or taken from the upstream linesize */
+    CUdeviceptr dptr;   /* base address of the buffer */
+} cu_tex;
+
+typedef struct NVResizeContext {
+    const AVClass *class;
+
+    /**
+    * Output count; per-output sizes are kept in outtex[].w/h, special values:
+    *   0 = original width/height
+    *  -1 = keep original aspect
+    *  -N = try to keep aspect but make sure it is divisible by N
+    */
+    int nb_outputs;
+
+    char *size_str;                  ///< "size"/"s" option: '|'-separated list of sizes, parsed in init()
+    int force_original_aspect_ratio; ///< 0 = off, 1 = take the smaller candidate size, otherwise the larger one
+    int readback_FB;                 ///< "readback" option
+    int gpu;                         ///< "gpu" option, passed to get_cuda_context()
+
+    int cuda_inited;
+
+    CUcontext   cu_ctx;              ///< obtained from get_cuda_context() in init()
+    CudaDynLoadFunctions* cu_dl_func; ///< function table returned by get_cuda_dl_func()
+    CUmodule    cu_module;           ///< module loaded from resize_ptx
+    CUfunction  cu_func_uchar;       ///< "Subsample_Bilinear_uchar" kernel
+    CUfunction  cu_func_uchar2;      ///< "Subsample_Bilinear_uchar2" kernel
+    CUfunction  cu_func_uchar4;      ///< "Subsample_Bilinear_uchar4" kernel
+    CUtexref    cu_tex_uchar;        ///< "uchar_tex" texture reference
+    CUtexref    cu_tex_uchar2;       ///< "uchar2_tex" texture reference
+    CUtexref    cu_tex_uchar4;       ///< "uchar4_tex" texture reference
+    cu_tex      intex;               ///< input frame buffer
+    cu_tex      outtex[MAX_OUTPUT];  ///< one buffer description per output pad
+
+} NVResizeContext;
+
+#define OFFSET(x) offsetof(NVResizeContext, x)
+#define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
+
+static const AVOption nvresize_options[] = { /* user-settable filter options; "s" is an alias of "size" */
+    { "outputs",  "set number of outputs",  OFFSET(nb_outputs),  AV_OPT_TYPE_INT, { .i64 = 1 }, 1, MAX_OUTPUT, FLAGS },
+    { "readback", "read result back to FB", OFFSET(readback_FB), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, FLAGS },
+    { "size",     "set video size",         OFFSET(size_str),    AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, /* min, max, flags: all three must be given */
+    { "s",        "set video size",         OFFSET(size_str),    AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
+    { "gpu", "Selects which NVENC capable GPU to use. First GPU is 0, second is 1, and so on.", OFFSET(gpu), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },
+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(nvresize);
+/* Register the pixel formats listed below as the common format list of the filter; returns AVERROR(ENOMEM) if the list cannot be built. */
+static int query_formats(AVFilterContext *ctx)
+{
+    static const enum AVPixelFormat pix_fmts[] = {
+        AV_PIX_FMT_YUV420P,
+        AV_PIX_FMT_YUV444P,
+        AV_PIX_FMT_NV12,
+        AV_PIX_FMT_ARGB,
+        AV_PIX_FMT_RGBA,
+        AV_PIX_FMT_ABGR,
+        AV_PIX_FMT_BGRA,
+        AV_PIX_FMT_NONE,
+    };
+
+    AVFilterFormats *fmts_list = ff_make_format_list((const int*)pix_fmts);
+    if (!fmts_list)
+        return AVERROR(ENOMEM);
+    return ff_set_common_formats(ctx, fmts_list);
+}
+
+/**
+ * Configure one output pad: resolve its final dimensions from the requested
+ * size (0 = input size, negative = derive from the input aspect ratio), set
+ * the sample aspect ratio and allocate the pitched device buffer for it.
+ * NOTE(review): how __cu() reports CUDA failures is defined in cudautils.h.
+ * @return 0 on success, AVERROR(EINVAL) if the resolved size does not fit an int
+ */
+static int config_output(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    AVFilterLink *inlink = outlink->src->inputs[0];
+    NVResizeContext *s = ctx->priv;
+
+    int outIdx = atoi(outlink->srcpad->name + 3); // skip the "out" prefix of the pad name
+    int64_t w, h;
+    int factor_w, factor_h;
+    w = s->outtex[outIdx].w;
+    h = s->outtex[outIdx].h;
+
+    // Check if it is requested that the result has to be divisible by some
+    // factor (w or h = -n with n being the factor).
+    factor_w = w < -1 ? -w : 1;
+    factor_h = h < -1 ? -h : 1;
+
+    if (w < 0 && h < 0)
+        s->outtex[outIdx].w = s->outtex[outIdx].h = 0;
+
+    if (!(w = s->outtex[outIdx].w))
+        w = inlink->w;
+    if (!(h = s->outtex[outIdx].h))
+        h = inlink->h;
+
+    // Make sure that the result is divisible by the factor we determined
+    // earlier. If no factor was set, nothing will happen as the default
+    // factor is 1
+    if (w < 0)
+        w = av_rescale(h, inlink->w, inlink->h * factor_w) * factor_w;
+    if (h < 0)
+        h = av_rescale(w, inlink->h, inlink->w * factor_h) * factor_h;
+
+    // Note that force_original_aspect_ratio may overwrite the previous set
+    // dimensions so that it is not divisible by the set factors anymore.
+    if (s->force_original_aspect_ratio) {
+        int tmp_w = av_rescale(h, inlink->w, inlink->h);
+        int tmp_h = av_rescale(w, inlink->h, inlink->w);
+        if (s->force_original_aspect_ratio == 1) {
+             w = FFMIN(tmp_w, w);
+             h = FFMIN(tmp_h, h);
+        } else {
+             w = FFMAX(tmp_w, w);
+             h = FFMAX(tmp_h, h);
+        }
+    }
+
+    if (w > INT_MAX || h > INT_MAX ||
+        (h * inlink->w) > INT_MAX  ||
+        (w * inlink->h) > INT_MAX) {
+        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
+        return AVERROR(EINVAL); // the values below are stored and multiplied as int
+    }
+
+    s->outtex[outIdx].w = outlink->w = w;
+    s->outtex[outIdx].h = outlink->h = h;
+
+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+    if (inlink->sample_aspect_ratio.num)
+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
+
+    // create output device memory
+    switch(outlink->format) {
+    case AV_PIX_FMT_YUV420P:
+    case AV_PIX_FMT_NV12:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3/2, 16));
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w, s->outtex[outIdx].h*3, 16));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->outtex[outIdx].dptr,
+                &s->outtex[outIdx].pitch, s->outtex[outIdx].w*4, s->outtex[outIdx].h, 16));
+        break;
+    }
+
+    return 0;
+}
+
+/**
+ * Create the "out%d" output pads, parse the '|'-separated size list into
+ * outtex[], order it by descending width and load the CUDA resize module.
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static av_cold int init(AVFilterContext *ctx)
+{
+    extern char resize_ptx[];
+    NVResizeContext *s = ctx->priv;
+    int ret;
+    int i, j, count = 0;
+    for (i = 0; i < s->nb_outputs; i++) {
+        char name[32];
+        AVFilterPad pad = { 0 };
+
+        snprintf(name, sizeof(name), "out%d", i);
+        pad.type = ctx->filter->inputs[0].type;
+        pad.name = av_strdup(name);
+        pad.config_props = config_output;
+        if (!pad.name)
+            return AVERROR(ENOMEM);
+        if ((ret = ff_insert_outpad(ctx, i, &pad)) < 0) {
+            av_freep(&pad.name); // insertion failed, so the name is not referenced by the filter
+            return ret;
+        }
+    }
+
+    // parse size parameters here; the string is split in place at each '|'
+    if (s->size_str) {
+        char* head = s->size_str;
+        while (head) {
+            char* found = strchr(head, '|');
+            if (found)
+                *found = 0;
+            if (count >= MAX_OUTPUT) { // outtex[] only has MAX_OUTPUT slots
+                av_log(ctx, AV_LOG_ERROR, "Too many sizes specified, at most %d are supported\n", MAX_OUTPUT);
+                return AVERROR(EINVAL);
+            }
+            if ((ret = av_parse_video_size(&s->outtex[count].w, &s->outtex[count].h, head)) < 0) {
+                av_log(ctx, AV_LOG_ERROR, "Invalid size '%s'\n", head);
+                return ret;
+            }
+            head = found ? found + 1 : NULL;
+            count++;
+        }
+    }
+
+    // sort the output by descending width
+    for (i = 0; i < count; i++) {
+        for (j = i+1; j < count; j++) {
+            if (s->outtex[i].w < s->outtex[j].w) {
+                FFSWAP(int, s->outtex[i].w, s->outtex[j].w);
+                FFSWAP(int, s->outtex[i].h, s->outtex[j].h);
+            }
+        }
+    }
+
+    if (count < s->nb_outputs) { // fewer sizes than outputs: leading outputs keep the input size (0x0)
+        int offset = s->nb_outputs - count;
+        for (i = s->nb_outputs-1; i >= offset; i--) {
+            s->outtex[i].w = s->outtex[i-offset].w;
+            s->outtex[i].h = s->outtex[i-offset].h;
+        }
+        for (i = 0; i < offset; i++)
+            s->outtex[i].w = s->outtex[i].h = 0;
+    }
+
+    // init cuda_context
+    if (!s->cu_ctx) {
+        init_cuda();
+        get_cuda_context(&s->cu_ctx, s->gpu);
+    }
+    s->cu_dl_func = get_cuda_dl_func();
+
+    __cu(s->cu_dl_func->cu_module_load_data(&s->cu_module, resize_ptx));
+    // load functions
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar,   s->cu_module, "Subsample_Bilinear_uchar"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar2,  s->cu_module, "Subsample_Bilinear_uchar2"));
+    __cu(s->cu_dl_func->cu_module_get_function(&s->cu_func_uchar4,  s->cu_module, "Subsample_Bilinear_uchar4"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar,  s->cu_module, "uchar_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"));
+    __cu(s->cu_dl_func->cu_module_get_texref(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"));
+
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar,  CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_flags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar,  CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR));
+    __cu(s->cu_dl_func->cu_texref_set_filtermode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR));
+
+    return 0;
+}
+/* Copy each plane of the host frame src into the device buffer dst (planes stored back to back); returns 0, or -1 for an unsupported format. */
+static int copy_from_avframe(NVResizeContext *s, AVFrame* src, cu_tex* dst)
+{
+    av_assert0(src->width == dst->w && src->height == dst->h);
+
+    switch (src->format) {
+    case AV_PIX_FMT_YUV420P:
+        // copy Y channel
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // copy U channel
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // copy V channel
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 5 / 4, dst->pitch / 2, src->width / 2, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        // copy Y channel
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // copy U channel
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // copy V channel
+        __cu(cuMemCpy2d(src->data[2], (CUdeviceptr)NULL, src->linesize[2], NULL, dst->dptr + dst->pitch*dst->h * 2, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        break;
+
+    case AV_PIX_FMT_NV12:
+        // copy Y channel
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        // copy UV channel
+        __cu(cuMemCpy2d(src->data[1], (CUdeviceptr)NULL, src->linesize[1], NULL, dst->dptr + dst->pitch*dst->h, dst->pitch, src->width, src->height / 2, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // copy the packed 32-bit plane
+        __cu(cuMemCpy2d(src->data[0], (CUdeviceptr)NULL, src->linesize[0], NULL, dst->dptr, dst->pitch, src->width * 4, src->height, CU_MEMORYTYPE_HOST, CU_MEMORYTYPE_DEVICE));
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(src->format));
+        return -1;
+    }
+    return 0;
+}
+/* Copy each plane of the device buffer src into the host frame dst (inverse of copy_from_avframe); returns 0, or -1 for an unsupported format. */
+static int copy_to_avframe(NVResizeContext* s, cu_tex* src, AVFrame* dst)
+{
+    //av_assert0(src->w == dst->width && src->h == dst->height);
+
+    switch (dst->format) {
+    case AV_PIX_FMT_YUV420P:
+        // copy Y channel
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // copy U channel
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch / 2, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // copy V channel
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 5 / 4, src->pitch / 2, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width / 2, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        // copy Y channel
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // copy U channel
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // copy V channel
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h * 2, src->pitch, dst->data[2], (CUdeviceptr)NULL, dst->linesize[2], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+        break;
+
+    case AV_PIX_FMT_NV12:
+        // copy Y channel
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        // copy UV channel
+        __cu(cuMemCpy2d(NULL, src->dptr + src->pitch*src->h, src->pitch, dst->data[1], (CUdeviceptr)NULL, dst->linesize[1], dst->width, dst->height / 2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        // copy the packed 32-bit plane
+        __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, dst->data[0], (CUdeviceptr)NULL, dst->linesize[0], dst->width * 4, dst->height, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_HOST));
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported output format: %s!\n", av_get_pix_fmt_name(dst->format));
+        return -1;
+    }
+    return 0;
+}
+/* Bind the source plane to texture reference tex and launch func on a grid of BLOCKX x BLOCKY thread blocks that covers the destination plane. */
+static int call_resize_kernel(CudaDynLoadFunctions* dl_func, CUfunction func, CUtexref tex, int channels,
+                             CUdeviceptr src_dptr, int src_width, int src_height, int src_pitch,
+                             CUdeviceptr dst_dptr, int dst_width, int dst_height, int dst_pitch)
+{
+    void *args_uchar[] = { &dst_dptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; // kernel parameters, in this order
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Width  = src_width;
+    desc.Height = src_height;
+    desc.NumChannels = channels;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    __cu(dl_func->cu_texref_set_address_2D(tex, &desc, src_dptr, src_pitch));
+
+    __cu(dl_func->cu_launch_kernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1,
+        BLOCKX, BLOCKY, 1, 0, NULL, args_uchar, NULL));
+
+    return 0;
+}
+/* Scale every plane of src into dst for the given pixel format, or copy when the dimensions match. NOTE(review): call_resize_kernel() results are ignored. */
+static int do_cuda_resize(NVResizeContext *s, cu_tex* src, cu_tex* dst, int format)
+{
+    switch (format) {
+    case AV_PIX_FMT_YUV420P:
+        if (src->w == dst->w && src->h == dst->h && src->pitch == dst->pitch) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->pitch, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+
+        }
+        else {
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch/2,
+                    dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2);
+
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr+src->pitch*src->h*5/4, src->w/2, src->h/2, src->pitch/2,
+                    dst->dptr+dst->pitch*dst->h*5/4, dst->w/2, dst->h/2, dst->pitch/2);
+        }
+
+        break;
+
+    case AV_PIX_FMT_YUV444P:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr+src->pitch*src->h, src->w, src->h, src->pitch,
+                    dst->dptr+dst->pitch*dst->h, dst->w, dst->h, dst->pitch);
+
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr+src->pitch*src->h*2, src->w, src->h, src->pitch,
+                    dst->dptr+dst->pitch*dst->h*2, dst->w, dst->h, dst->pitch);
+        }
+
+        break;
+
+    case AV_PIX_FMT_NV12:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w, src->h*3/2, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+        }
+        else {
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar, s->cu_tex_uchar, 1,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch);
+
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar2, s->cu_tex_uchar2, 2,
+                    src->dptr+src->pitch*src->h, src->w/2, src->h/2, src->pitch,
+                    dst->dptr+dst->pitch*dst->h, dst->w/2, dst->h/2, dst->pitch/2);
+        }
+
+        break;
+
+    case AV_PIX_FMT_ARGB:
+    case AV_PIX_FMT_RGBA:
+    case AV_PIX_FMT_ABGR:
+    case AV_PIX_FMT_BGRA:
+        if (src->w == dst->w && src->h == dst->h) {
+            __cu(cuMemCpy2d(NULL, src->dptr, src->pitch, NULL, dst->dptr, dst->pitch, src->w*4, src->h, CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_DEVICE));
+
+        }
+        else {
+            call_resize_kernel(s->cu_dl_func, s->cu_func_uchar4, s->cu_tex_uchar4, 4,
+                    src->dptr, src->w, src->h, src->pitch,
+                    dst->dptr, dst->w, dst->h, dst->pitch/4);
+        }
+
+        break;
+
+    default:
+        av_log(NULL, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(format));
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Choose the surface `target` is scaled from: s->intex when no source is given,
+ * otherwise the nearest earlier surface narrower than 4x the target width. */
+static cu_tex* find_resize_src(NVResizeContext* s, cu_tex* source, cu_tex* target)
+{
+    int level, idx;
+    cu_tex* fallback;
+
+    if (!source)
+        return &s->intex;
+    if (source->w < target->w * 4)
+        return source;
+
+    // scan the outputs that precede `target`, closest one first
+    level = target - s->outtex;
+    idx = level;
+    while (--idx >= 0)
+        if (s->outtex[idx].w < target->w * 4)
+            return &s->outtex[idx];
+
+    // nothing qualifies: warn and use the level right before (or `source`)
+    fallback = level ? &s->outtex[level - 1] : source;
+    av_log(NULL, AV_LOG_WARNING, "Output resolution %dx%d differs too much from the previous level %dx%d, "
+            "might cause artificial\n", target->w, target->h, fallback->w, fallback->h);
+    return fallback;
+}
+
+/*
+ * Input pad callback; consumes `in`. A frame whose opaque passes check_nvinfo()
+ * and holds a device pointer is used in place, any other is uploaded to s->intex.
+ * The result is resized for each open output. Returns 0 or a negative error.
+ */
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx  = inlink->dst;
+    NVResizeContext *s = ctx->priv;
+    int i, ret = 0;
+    cu_tex* resize_src = NULL;
+    ffnvinfo* info;
+
+    // copy input to gpu
+    if (in->opaque && check_nvinfo(in->opaque) && ((ffnvinfo*)(in->opaque))->dptr[0]) {
+        info = (ffnvinfo*)in->opaque;
+        s->intex.dptr = info->dptr[0];
+        s->intex.pitch = info->linesize[0];
+        s->intex.w = in->width;
+        s->intex.h = in->height;
+    } else {
+        // frame size changed: release the old upload buffer (only if one exists)
+        if (in->width != s->intex.w || in->height != s->intex.h) {
+            if (s->intex.dptr) { __cu(s->cu_dl_func->cu_mem_free(s->intex.dptr)); }
+            s->intex.w = in->width;
+            s->intex.h = in->height;
+            s->intex.dptr = (CUdeviceptr)NULL;
+        }
+        if (!s->intex.dptr) {
+            switch (in->format) {
+            case AV_PIX_FMT_YUV420P:
+            case AV_PIX_FMT_NV12:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3/2, 16));
+                break;
+            case AV_PIX_FMT_YUV444P:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w, s->intex.h*3, 16));
+                break;
+            case AV_PIX_FMT_ARGB:
+            case AV_PIX_FMT_RGBA:
+            case AV_PIX_FMT_ABGR:
+            case AV_PIX_FMT_BGRA:
+                __cu(s->cu_dl_func->cu_mem_alloc_pitch(&s->intex.dptr, &s->intex.pitch, s->intex.w*4, s->intex.h, 16));
+                break;
+            default:
+                av_log(ctx, AV_LOG_FATAL, "Unsupported input format: %s!\n", av_get_pix_fmt_name(in->format));
+                av_frame_free(&in); // the frame is ours to release on every exit path
+                return AVERROR(EINVAL);
+            }
+        }
+        copy_from_avframe(s, in, &s->intex);
+    }
+
+    for (i = 0; i < ctx->nb_outputs; i++) {
+        AVFrame *out;
+        if (ctx->outputs[i]->status)
+            continue;
+        out = ff_get_video_buffer(ctx->outputs[i], ctx->outputs[i]->w, ctx->outputs[i]->h);
+        if (!out) {
+            ret = AVERROR(ENOMEM);
+            break;
+        }
+        av_frame_copy_props(out, in);
+        // resize into this output's buffer; give up if the resize is rejected
+        resize_src = find_resize_src(s, resize_src, &s->outtex[i]);
+        ret = do_cuda_resize(s, resize_src, &s->outtex[i], in->format);
+        if (ret < 0) {
+            av_frame_free(&out);
+            break;
+        }
+        info = init_nvinfo();
+        switch (out->format) {
+        case AV_PIX_FMT_YUV444P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*2;
+            info->linesize[0] = info->linesize[1] = info->linesize[2] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_YUV420P:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->dptr[2] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h*5/4;
+            info->linesize[0] = s->outtex[i].pitch;
+            info->linesize[1] = info->linesize[2] = s->outtex[i].pitch/2;
+            break;
+        case AV_PIX_FMT_NV12:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->dptr[1] = s->outtex[i].dptr + s->outtex[i].pitch*s->outtex[i].h;
+            info->linesize[0] = info->linesize[1] = s->outtex[i].pitch;
+            break;
+        case AV_PIX_FMT_ARGB:
+        case AV_PIX_FMT_RGBA:
+        case AV_PIX_FMT_ABGR:
+        case AV_PIX_FMT_BGRA:
+            info->dptr[0] = s->outtex[i].dptr;
+            info->linesize[0] = s->outtex[i].pitch;
+            break;
+        }
+        out->opaque = (void*)info;
+        if (s->readback_FB)
+            copy_to_avframe(s, &s->outtex[i], out);
+        ret = ff_filter_frame(ctx->outputs[i], out);
+        if (ret < 0)
+            break;
+    }
+
+    av_frame_free(&in);
+    return ret;
+}
+
+
+/* Teardown: free each output pad name and output dptr, then release the CUDA context. */
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    NVResizeContext *s = ctx->priv;
+    int idx;
+    for (idx = 0; idx < s->nb_outputs; idx++) {
+        av_freep(&ctx->output_pads[idx].name);
+        if (s->outtex[idx].dptr) s->cu_dl_func->cu_mem_free(s->outtex[idx].dptr);
+    }
+    if (s->cu_ctx)
+        release_cuda_context(&s->cu_ctx, s->gpu);
+    av_log(ctx, AV_LOG_INFO, "nvresize::uninit\n");
+}
+
+/* The single video input pad; incoming frames are handed to filter_frame(). */
+static const AVFilterPad nvresize_inputs[] = {
+    {
+        .type           = AVMEDIA_TYPE_VIDEO,
+        .name           = "default",
+        .filter_frame   = filter_frame,
+    }, { NULL }
+};
+
+/* nvresize filter definition: no static output pads, AVFILTER_FLAG_DYNAMIC_OUTPUTS is set instead. */
+AVFilter ff_vf_nvresize = {
+    .name          = "nvresize",
+    .description   = NULL_IF_CONFIG_SMALL("GPU accelerated video resizer."),
+    .priv_size     = sizeof(NVResizeContext),
+    .priv_class    = &nvresize_class,
+    .init          = init, .uninit = uninit,
+    .query_formats = query_formats,
+    .inputs        = nvresize_inputs,
+    .outputs       = NULL,
+    .flags         = AVFILTER_FLAG_DYNAMIC_OUTPUTS,
+};
diff --git a/libavfilter/vf_nvresize_ptx.c b/libavfilter/vf_nvresize_ptx.c
new file mode 100644
index 0000000..6aa15f5
--- /dev/null
+++ b/libavfilter/vf_nvresize_ptx.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#if _WIN32 || _WIN64
+#if _WIN64
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+// Check GCC
+#if __GNUC__
+#if __x86_64__ || __ppc64__
+#define ENVIRONMENT64
+#else
+#define ENVIRONMENT32
+#endif
+#endif
+#ifdef ENVIRONMENT32 /* 32-bit build: embed the PTX image declared with .address_size 32 */
+const char resize_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19324607\n"
+	"// Cuda compilation tools, release 7.0, V7.0.27\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.2\n"
+	".target sm_30\n"
+	".address_size 32\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar\n"
+	".global .texref uchar_tex;\n"
+	".global .texref uchar2_tex;\n"
+	".global .texref uchar4_tex;\n"
+	"\n"
+	".visible .entry Subsample_Bilinear_uchar(\n" /* 1-byte pixels; each thread writes one output pixel */
+	".param .u32 Subsample_Bilinear_uchar_param_0,\n" /* output base address */
+	".param .u32 Subsample_Bilinear_uchar_param_1,\n" /* output width: bound for the x index */
+	".param .u32 Subsample_Bilinear_uchar_param_2,\n" /* output height: bound for the y index */
+	".param .u32 Subsample_Bilinear_uchar_param_3,\n" /* output row stride, counted in pixels */
+	".param .u32 Subsample_Bilinear_uchar_param_4,\n" /* x scale numerator (scale = param_4 / param_1) */
+	".param .u32 Subsample_Bilinear_uchar_param_5\n" /* y scale numerator (scale = param_5 / param_2) */
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<43>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n" /* y < param_2 */
+	"setp.lt.s32	%p2, %r1, %r4;\n" /* x < param_1 */
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB0_2;\n"
+	"bra.uni 	BB0_1;\n"
+	"\n"
+	"BB0_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar_tex, {%f23, %f24}];\n"
+	"and.b32  	%r20, %r16, 255;\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r21, %r22, %r23, %r24}, [uchar_tex, {%f25, %f24}];\n"
+	"and.b32  	%r25, %r21, 255;\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar_tex, {%f23, %f26}];\n"
+	"and.b32  	%r30, %r26, 255;\n"
+	"tex.2d.v4.u32.f32	{%r31, %r32, %r33, %r34}, [uchar_tex, {%f25, %f26}];\n"
+	"and.b32  	%r35, %r31, 255;\n"
+	"add.s32 	%r36, %r20, %r25;\n"
+	"add.s32 	%r37, %r36, %r30;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n" /* rounded mean of the four texture samples: (a+b+c+d+2)>>2 */
+	"mad.lo.s32 	%r41, %r2, %r6, %r1;\n"
+	"add.s32 	%r42, %r15, %r41;\n"
+	"st.global.u8 	[%r42], %r40;\n" /* store at param_0 + y*param_3 + x */
+	"\n"
+	"BB0_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar2\n"
+	".visible .entry Subsample_Bilinear_uchar2(\n" /* same scheme for 2-byte pixels: two channel means, v2.u8 store */
+	".param .u32 Subsample_Bilinear_uchar2_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<3>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<53>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar2_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar2_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar2_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar2_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar2_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar2_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n"
+	"setp.lt.s32	%p2, %r1, %r4;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB1_2;\n"
+	"bra.uni 	BB1_1;\n"
+	"\n"
+	"BB1_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar2_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r20, %r21, %r22, %r23}, [uchar2_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar2_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r28, %r29, %r30, %r31}, [uchar2_tex, {%f25, %f26}];\n"
+	"and.b32  	%r32, %r16, 255;\n"
+	"and.b32  	%r33, %r20, 255;\n"
+	"and.b32  	%r34, %r24, 255;\n"
+	"and.b32  	%r35, %r28, 255;\n"
+	"add.s32 	%r36, %r32, %r33;\n"
+	"add.s32 	%r37, %r36, %r34;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n"
+	"and.b32  	%r41, %r17, 255;\n"
+	"and.b32  	%r42, %r21, 255;\n"
+	"and.b32  	%r43, %r25, 255;\n"
+	"and.b32  	%r44, %r29, 255;\n"
+	"add.s32 	%r45, %r41, %r42;\n"
+	"add.s32 	%r46, %r45, %r43;\n"
+	"add.s32 	%r47, %r46, %r44;\n"
+	"add.s32 	%r48, %r47, 2;\n"
+	"shr.u32 	%r49, %r48, 2;\n"
+	"mad.lo.s32 	%r50, %r2, %r6, %r1;\n"
+	"shl.b32 	%r51, %r50, 1;\n"
+	"add.s32 	%r52, %r15, %r51;\n"
+	"cvt.u16.u32	%rs1, %r49;\n"
+	"cvt.u16.u32	%rs2, %r40;\n"
+	"st.global.v2.u8 	[%r52], {%rs2, %rs1};\n"
+	"\n"
+	"BB1_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar4\n"
+	".visible .entry Subsample_Bilinear_uchar4(\n" /* same scheme for 4-byte pixels: four channel means, v4.u8 store */
+	".param .u32 Subsample_Bilinear_uchar4_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<5>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<71>;\n"
+	".reg .s64 	%rd<2>;\n"
+	"\n"
+	"\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar4_param_0];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar4_param_1];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar4_param_2];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar4_param_3];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar4_param_4];\n"
+	"ld.param.u32 	%r8, [Subsample_Bilinear_uchar4_param_5];\n"
+	"mov.u32 	%r9, %ctaid.x;\n"
+	"mov.u32 	%r10, %ntid.x;\n"
+	"mov.u32 	%r11, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r10, %r9, %r11;\n"
+	"mov.u32 	%r12, %ntid.y;\n"
+	"mov.u32 	%r13, %ctaid.y;\n"
+	"mov.u32 	%r14, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r12, %r13, %r14;\n"
+	"setp.lt.s32	%p1, %r2, %r5;\n"
+	"setp.lt.s32	%p2, %r1, %r4;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB2_2;\n"
+	"bra.uni 	BB2_1;\n"
+	"\n"
+	"BB2_1:\n"
+	"cvta.to.global.u32 	%r15, %r3;\n"
+	"cvt.rn.f32.s32	%f1, %r4;\n"
+	"cvt.rn.f32.s32	%f2, %r7;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r5;\n"
+	"cvt.rn.f32.s32	%f5, %r8;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r16, %r17, %r18, %r19}, [uchar4_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r20, %r21, %r22, %r23}, [uchar4_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar4_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r28, %r29, %r30, %r31}, [uchar4_tex, {%f25, %f26}];\n"
+	"and.b32  	%r32, %r16, 255;\n"
+	"and.b32  	%r33, %r20, 255;\n"
+	"and.b32  	%r34, %r24, 255;\n"
+	"and.b32  	%r35, %r28, 255;\n"
+	"add.s32 	%r36, %r32, %r33;\n"
+	"add.s32 	%r37, %r36, %r34;\n"
+	"add.s32 	%r38, %r37, %r35;\n"
+	"add.s32 	%r39, %r38, 2;\n"
+	"shr.u32 	%r40, %r39, 2;\n"
+	"and.b32  	%r41, %r17, 255;\n"
+	"and.b32  	%r42, %r21, 255;\n"
+	"and.b32  	%r43, %r25, 255;\n"
+	"and.b32  	%r44, %r29, 255;\n"
+	"add.s32 	%r45, %r41, %r42;\n"
+	"add.s32 	%r46, %r45, %r43;\n"
+	"add.s32 	%r47, %r46, %r44;\n"
+	"add.s32 	%r48, %r47, 2;\n"
+	"shr.u32 	%r49, %r48, 2;\n"
+	"and.b32  	%r50, %r18, 255;\n"
+	"and.b32  	%r51, %r22, 255;\n"
+	"and.b32  	%r52, %r26, 255;\n"
+	"and.b32  	%r53, %r30, 255;\n"
+	"add.s32 	%r54, %r50, %r51;\n"
+	"add.s32 	%r55, %r54, %r52;\n"
+	"add.s32 	%r56, %r55, %r53;\n"
+	"add.s32 	%r57, %r56, 2;\n"
+	"shr.u32 	%r58, %r57, 2;\n"
+	"and.b32  	%r59, %r19, 255;\n"
+	"and.b32  	%r60, %r23, 255;\n"
+	"and.b32  	%r61, %r27, 255;\n"
+	"and.b32  	%r62, %r31, 255;\n"
+	"add.s32 	%r63, %r59, %r60;\n"
+	"add.s32 	%r64, %r63, %r61;\n"
+	"add.s32 	%r65, %r64, %r62;\n"
+	"add.s32 	%r66, %r65, 2;\n"
+	"shr.u32 	%r67, %r66, 2;\n"
+	"mad.lo.s32 	%r68, %r2, %r6, %r1;\n"
+	"shl.b32 	%r69, %r68, 2;\n"
+	"add.s32 	%r70, %r15, %r69;\n"
+	"cvt.u16.u32	%rs1, %r67;\n"
+	"cvt.u16.u32	%rs2, %r58;\n"
+	"cvt.u16.u32	%rs3, %r49;\n"
+	"cvt.u16.u32	%rs4, %r40;\n"
+	"st.global.v4.u8 	[%r70], {%rs4, %rs3, %rs2, %rs1};\n"
+	"\n"
+	"BB2_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#elif defined ENVIRONMENT64 /* 64-bit build: same three kernels, .address_size 64 and a .u64 param_0 */
+const char resize_ptx[] = \
+	"//\n"
+	"// Generated by NVIDIA NVVM Compiler\n"
+	"//\n"
+	"// Compiler Build ID: CL-19324607\n"
+	"// Cuda compilation tools, release 7.0, V7.0.27\n"
+	"// Based on LLVM 3.4svn\n"
+	"//\n"
+	"\n"
+	".version 4.2\n"
+	".target sm_30\n"
+	".address_size 64\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar\n"
+	".global .texref uchar_tex;\n"
+	".global .texref uchar2_tex;\n"
+	".global .texref uchar4_tex;\n"
+	"\n"
+	".visible .entry Subsample_Bilinear_uchar(\n" /* 1-byte pixels; each thread writes one output pixel */
+	".param .u64 Subsample_Bilinear_uchar_param_0,\n" /* output base address */
+	".param .u32 Subsample_Bilinear_uchar_param_1,\n" /* output width: bound for the x index */
+	".param .u32 Subsample_Bilinear_uchar_param_2,\n" /* output height: bound for the y index */
+	".param .u32 Subsample_Bilinear_uchar_param_3,\n" /* output row stride, counted in pixels */
+	".param .u32 Subsample_Bilinear_uchar_param_4,\n" /* x scale numerator (scale = param_4 / param_1) */
+	".param .u32 Subsample_Bilinear_uchar_param_5\n" /* y scale numerator (scale = param_5 / param_2) */
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<40>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n" /* y < param_2 */
+	"setp.lt.s32	%p2, %r1, %r3;\n" /* x < param_1 */
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB0_2;\n"
+	"bra.uni 	BB0_1;\n"
+	"\n"
+	"BB0_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar_tex, {%f23, %f24}];\n"
+	"and.b32  	%r18, %r14, 255;\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r19, %r20, %r21, %r22}, [uchar_tex, {%f25, %f24}];\n"
+	"and.b32  	%r23, %r19, 255;\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r24, %r25, %r26, %r27}, [uchar_tex, {%f23, %f26}];\n"
+	"and.b32  	%r28, %r24, 255;\n"
+	"tex.2d.v4.u32.f32	{%r29, %r30, %r31, %r32}, [uchar_tex, {%f25, %f26}];\n"
+	"and.b32  	%r33, %r29, 255;\n"
+	"add.s32 	%r34, %r18, %r23;\n"
+	"add.s32 	%r35, %r34, %r28;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n" /* rounded mean of the four texture samples: (a+b+c+d+2)>>2 */
+	"mad.lo.s32 	%r39, %r2, %r5, %r1;\n"
+	"cvt.s64.s32	%rd4, %r39;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"st.global.u8 	[%rd5], %r38;\n" /* store at param_0 + y*param_3 + x */
+	"\n"
+	"BB0_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar2\n"
+	".visible .entry Subsample_Bilinear_uchar2(\n" /* same scheme for 2-byte pixels: two channel means, v2.u8 store */
+	".param .u64 Subsample_Bilinear_uchar2_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar2_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<3>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<49>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar2_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar2_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar2_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar2_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar2_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar2_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n"
+	"setp.lt.s32	%p2, %r1, %r3;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB1_2;\n"
+	"bra.uni 	BB1_1;\n"
+	"\n"
+	"BB1_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar2_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r18, %r19, %r20, %r21}, [uchar2_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r22, %r23, %r24, %r25}, [uchar2_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar2_tex, {%f25, %f26}];\n"
+	"and.b32  	%r30, %r14, 255;\n"
+	"and.b32  	%r31, %r18, 255;\n"
+	"and.b32  	%r32, %r22, 255;\n"
+	"and.b32  	%r33, %r26, 255;\n"
+	"add.s32 	%r34, %r30, %r31;\n"
+	"add.s32 	%r35, %r34, %r32;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n"
+	"and.b32  	%r39, %r15, 255;\n"
+	"and.b32  	%r40, %r19, 255;\n"
+	"and.b32  	%r41, %r23, 255;\n"
+	"and.b32  	%r42, %r27, 255;\n"
+	"add.s32 	%r43, %r39, %r40;\n"
+	"add.s32 	%r44, %r43, %r41;\n"
+	"add.s32 	%r45, %r44, %r42;\n"
+	"add.s32 	%r46, %r45, 2;\n"
+	"shr.u32 	%r47, %r46, 2;\n"
+	"mad.lo.s32 	%r48, %r2, %r5, %r1;\n"
+	"mul.wide.s32 	%rd4, %r48, 2;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"cvt.u16.u32	%rs1, %r47;\n"
+	"cvt.u16.u32	%rs2, %r38;\n"
+	"st.global.v2.u8 	[%rd5], {%rs2, %rs1};\n"
+	"\n"
+	"BB1_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"// .globl	Subsample_Bilinear_uchar4\n"
+	".visible .entry Subsample_Bilinear_uchar4(\n" /* same scheme for 4-byte pixels: four channel means, v4.u8 store */
+	".param .u64 Subsample_Bilinear_uchar4_param_0,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_1,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_2,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_3,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_4,\n"
+	".param .u32 Subsample_Bilinear_uchar4_param_5\n"
+	")\n"
+	"{\n"
+	".reg .pred 	%p<4>;\n"
+	".reg .s16 	%rs<5>;\n"
+	".reg .f32 	%f<27>;\n"
+	".reg .s32 	%r<67>;\n"
+	".reg .s64 	%rd<6>;\n"
+	"\n"
+	"\n"
+	"ld.param.u64 	%rd1, [Subsample_Bilinear_uchar4_param_0];\n"
+	"ld.param.u32 	%r3, [Subsample_Bilinear_uchar4_param_1];\n"
+	"ld.param.u32 	%r4, [Subsample_Bilinear_uchar4_param_2];\n"
+	"ld.param.u32 	%r5, [Subsample_Bilinear_uchar4_param_3];\n"
+	"ld.param.u32 	%r6, [Subsample_Bilinear_uchar4_param_4];\n"
+	"ld.param.u32 	%r7, [Subsample_Bilinear_uchar4_param_5];\n"
+	"mov.u32 	%r8, %ctaid.x;\n"
+	"mov.u32 	%r9, %ntid.x;\n"
+	"mov.u32 	%r10, %tid.x;\n"
+	"mad.lo.s32 	%r1, %r9, %r8, %r10;\n"
+	"mov.u32 	%r11, %ntid.y;\n"
+	"mov.u32 	%r12, %ctaid.y;\n"
+	"mov.u32 	%r13, %tid.y;\n"
+	"mad.lo.s32 	%r2, %r11, %r12, %r13;\n"
+	"setp.lt.s32	%p1, %r2, %r4;\n"
+	"setp.lt.s32	%p2, %r1, %r3;\n"
+	"and.pred  	%p3, %p1, %p2;\n"
+	"@!%p3 bra 	BB2_2;\n"
+	"bra.uni 	BB2_1;\n"
+	"\n"
+	"BB2_1:\n"
+	"cvta.to.global.u64 	%rd2, %rd1;\n"
+	"cvt.rn.f32.s32	%f1, %r3;\n"
+	"cvt.rn.f32.s32	%f2, %r6;\n"
+	"div.rn.f32 	%f3, %f2, %f1;\n"
+	"cvt.rn.f32.s32	%f4, %r4;\n"
+	"cvt.rn.f32.s32	%f5, %r7;\n"
+	"div.rn.f32 	%f6, %f5, %f4;\n"
+	"cvt.rn.f32.s32	%f7, %r1;\n"
+	"add.f32 	%f8, %f7, 0f3F000000;\n"
+	"mul.f32 	%f9, %f8, %f3;\n"
+	"cvt.rn.f32.s32	%f10, %r2;\n"
+	"add.f32 	%f11, %f10, 0f3F000000;\n"
+	"mul.f32 	%f12, %f11, %f6;\n"
+	"add.f32 	%f13, %f3, 0fBF800000;\n"
+	"mul.f32 	%f14, %f13, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f15, %f14;\n"
+	"add.f32 	%f16, %f6, 0fBF800000;\n"
+	"mul.f32 	%f17, %f16, 0f3F000000;\n"
+	"cvt.sat.f32.f32	%f18, %f17;\n"
+	"add.f32 	%f19, %f15, 0f3F000000;\n"
+	"div.rn.f32 	%f20, %f15, %f19;\n"
+	"add.f32 	%f21, %f18, 0f3F000000;\n"
+	"div.rn.f32 	%f22, %f18, %f21;\n"
+	"sub.f32 	%f23, %f9, %f20;\n"
+	"sub.f32 	%f24, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r14, %r15, %r16, %r17}, [uchar4_tex, {%f23, %f24}];\n"
+	"add.f32 	%f25, %f9, %f20;\n"
+	"tex.2d.v4.u32.f32	{%r18, %r19, %r20, %r21}, [uchar4_tex, {%f25, %f24}];\n"
+	"add.f32 	%f26, %f12, %f22;\n"
+	"tex.2d.v4.u32.f32	{%r22, %r23, %r24, %r25}, [uchar4_tex, {%f23, %f26}];\n"
+	"tex.2d.v4.u32.f32	{%r26, %r27, %r28, %r29}, [uchar4_tex, {%f25, %f26}];\n"
+	"and.b32  	%r30, %r14, 255;\n"
+	"and.b32  	%r31, %r18, 255;\n"
+	"and.b32  	%r32, %r22, 255;\n"
+	"and.b32  	%r33, %r26, 255;\n"
+	"add.s32 	%r34, %r30, %r31;\n"
+	"add.s32 	%r35, %r34, %r32;\n"
+	"add.s32 	%r36, %r35, %r33;\n"
+	"add.s32 	%r37, %r36, 2;\n"
+	"shr.u32 	%r38, %r37, 2;\n"
+	"and.b32  	%r39, %r15, 255;\n"
+	"and.b32  	%r40, %r19, 255;\n"
+	"and.b32  	%r41, %r23, 255;\n"
+	"and.b32  	%r42, %r27, 255;\n"
+	"add.s32 	%r43, %r39, %r40;\n"
+	"add.s32 	%r44, %r43, %r41;\n"
+	"add.s32 	%r45, %r44, %r42;\n"
+	"add.s32 	%r46, %r45, 2;\n"
+	"shr.u32 	%r47, %r46, 2;\n"
+	"and.b32  	%r48, %r16, 255;\n"
+	"and.b32  	%r49, %r20, 255;\n"
+	"and.b32  	%r50, %r24, 255;\n"
+	"and.b32  	%r51, %r28, 255;\n"
+	"add.s32 	%r52, %r48, %r49;\n"
+	"add.s32 	%r53, %r52, %r50;\n"
+	"add.s32 	%r54, %r53, %r51;\n"
+	"add.s32 	%r55, %r54, 2;\n"
+	"shr.u32 	%r56, %r55, 2;\n"
+	"and.b32  	%r57, %r17, 255;\n"
+	"and.b32  	%r58, %r21, 255;\n"
+	"and.b32  	%r59, %r25, 255;\n"
+	"and.b32  	%r60, %r29, 255;\n"
+	"add.s32 	%r61, %r57, %r58;\n"
+	"add.s32 	%r62, %r61, %r59;\n"
+	"add.s32 	%r63, %r62, %r60;\n"
+	"add.s32 	%r64, %r63, 2;\n"
+	"shr.u32 	%r65, %r64, 2;\n"
+	"mad.lo.s32 	%r66, %r2, %r5, %r1;\n"
+	"mul.wide.s32 	%rd4, %r66, 4;\n"
+	"add.s64 	%rd5, %rd2, %rd4;\n"
+	"cvt.u16.u32	%rs1, %r65;\n"
+	"cvt.u16.u32	%rs2, %r56;\n"
+	"cvt.u16.u32	%rs3, %r47;\n"
+	"cvt.u16.u32	%rs4, %r38;\n"
+	"st.global.v4.u8 	[%rd5], {%rs4, %rs3, %rs2, %rs1};\n"
+	"\n"
+	"BB2_2:\n"
+	"ret;\n"
+	"}\n"
+	"\n"
+	"\n"
+;
+#endif /* ENVIRONMENT32 / ENVIRONMENT64 */