diff --git a/patches/0004-FFVA-v0.5-release-patches.patch b/patches/0004-FFVA-v0.5-release-patches.patch
new file mode 100644
index 0000000..2ab26d6
--- /dev/null
+++ b/patches/0004-FFVA-v0.5-release-patches.patch
@@ -0,0 +1,2919 @@
+From 2f4fc35a57f05f9d8c74861bc3b248dba54d03f4 Mon Sep 17 00:00:00 2001
+From: "Xie, Lin"
+Date: Mon, 10 Feb 2020 22:03:02 -0800
+Subject: [PATCH] FFVA v0.5 release patches
+
+* Add a generic infer filter
+* Support the Faster R-CNN model
+* Introduce a new filter 'python'
+* Orchestra - component latency reporting
+* Add I420 format support
+* Release the second model input name
+* Support CSRNet
+* Use the official OpenVINO C API
+* Fix compile failure when the IE C API is not enabled
+---
+ configure | 24 +-
+ fftools/ffmpeg.c | 35 ++
+ libavcodec/avcodec.h | 2 +
+ libavcodec/decode.c | 8 +
+ libavcodec/encode.c | 6 +
+ libavcodec/pthread_frame.c | 8 +
+ libavfilter/Makefile | 20 +-
+ libavfilter/allfilters.c | 2 +
+ libavfilter/avfilter.c | 4 +
+ libavfilter/avfilter.h | 2 +-
+ libavfilter/framequeue.c | 6 +
+ libavfilter/inference_backend/Makefile | 17 +
+ .../inference_backend/ff_base_inference.h | 10 +-
+ .../inference_backend/ff_inference_impl.c | 103 ++----
+ .../inference_backend/ff_proc_factory.c | 99 ++++-
+ .../inference_backend/image_inference.h | 2 +-
+ .../image_inference_async_preproc.c | 18 +-
+ libavfilter/inference_backend/metaconverter.h | 2 +-
+ libavfilter/inference_backend/model_proc.c | 19 +-
+ libavfilter/inference_backend/model_proc.h | 4 +-
+ .../openvino_image_inference.c | 320 ++++++++---------
+ .../openvino_image_inference.h | 10 +-
+ libavfilter/inference_backend/pre_proc.c | 32 +-
+ libavfilter/inference_backend/pre_proc.h | 10 +-
+ libavfilter/inference_backend/wrap_image.c | 138 +++++++
+ libavfilter/inference_backend/wrap_image.h | 27 ++
+ libavfilter/vf_inference_classify.c | 1 -
+ libavfilter/vf_inference_detect.c | 1 -
+ libavfilter/vf_inference_infer.c | 337 ++++++++++++++++++
+ libavfilter/vf_inference_python.c | 334 +++++++++++++++++
+ libavutil/frame.c | 5 +-
+ libavutil/frame.h | 4 +
+ python/ffmpeg/__init__.py | 7 +
+ python/ffmpeg/avutil.py | 15 +
+ python/ffmpeg/ffmpeg_decls.py | 192 ++++++++++
+ python/ffmpeg/video_frame.py | 122 +++++++
+ 36 files changed, 1621 insertions(+), 325 deletions(-)
+ create mode 100644 libavfilter/inference_backend/Makefile
+ create mode 100755 libavfilter/inference_backend/wrap_image.c
+ create mode 100755 libavfilter/inference_backend/wrap_image.h
+ create mode 100755 libavfilter/vf_inference_infer.c
+ create mode 100755 libavfilter/vf_inference_python.c
+ create mode 100755 python/ffmpeg/__init__.py
+ create mode 100755 python/ffmpeg/avutil.py
+ create mode 100755 python/ffmpeg/ffmpeg_decls.py
+ create mode 100755 python/ffmpeg/video_frame.py
+
+diff --git a/configure b/configure
+index 04df3016ab..785989afbd 100755
+--- a/configure
++++ b/configure
+@@ -240,7 +240,7 @@ External library support:
+ --enable-libgsm enable GSM de/encoding via libgsm [no]
+ --enable-libiec61883 enable iec61883 via libiec61883 [no]
+ --enable-libilbc enable iLBC de/encoding via libilbc [no]
+- --enable-libinference_engine_c_wrapper enable dldt inference engine c wrapper [no]
++ --enable-libinference_engine_c_api enable dldt inference engine c api [no]
+ --enable-libjack enable JACK audio sound server [no]
+ --enable-libjson_c enable libjson-c [no]
+ --enable-libklvanc enable Kernel Labs VANC processing [no]
+@@ -311,6 +311,7 @@ External library support:
+ --enable-openssl enable openssl, needed for https support
+ if gnutls,
libtls or mbedtls is not used [no] + --enable-pocketsphinx enable PocketSphinx, needed for asr filter [no] ++ --enable-python3 enable python3 libs, needed for inference python filter [no] + --disable-sndio disable sndio support [autodetect] + --disable-schannel disable SChannel SSP, needed for TLS support on + Windows if openssl and gnutls are not used [autodetect] +@@ -1775,7 +1776,7 @@ EXTERNAL_LIBRARY_LIST=" + libgsm + libiec61883 + libilbc +- libinference_engine_c_wrapper ++ libinference_engine_c_api + libjack + libjson_c + libklvanc +@@ -1816,6 +1817,7 @@ EXTERNAL_LIBRARY_LIST=" + openal + opengl + pocketsphinx ++ python3 + vapoursynth + librdkafka + " +@@ -2610,7 +2612,7 @@ cbs_vp9_select="cbs" + dct_select="rdft" + dirac_parse_select="golomb" + dnn_suggest="libtensorflow" +-image_inference_suggest="libinference_engine_c_wrapper" ++image_inference_suggest="libinference_engine_c_api" + image_inference_deps="libjson_c" + error_resilience_select="me_cmp" + faandct_deps="faan" +@@ -3482,12 +3484,16 @@ geq_filter_deps="gpl" + histeq_filter_deps="gpl" + hqdn3d_filter_deps="gpl" + inference_identify_filter_deps="libjson_c" +-inference_identify_filter_select="dnn" ++inference_identify_filter_select="image_inference" + inference_metaconvert_filter_deps="libjson_c" +-inference_classify_filter_deps="libinference_engine_c_wrapper libjson_c" ++inference_metaconvert_filter_select="image_inference" ++inference_python_filter_deps="python3" ++inference_classify_filter_deps="libinference_engine_c_api libjson_c" + inference_classify_filter_select="image_inference" +-inference_detect_filter_deps="libinference_engine_c_wrapper libjson_c" ++inference_detect_filter_deps="libinference_engine_c_api libjson_c" + inference_detect_filter_select="image_inference" ++inference_infer_filter_deps="libinference_engine_c_api libjson_c" ++inference_infer_filter_select="image_inference" + interlace_filter_deps="gpl" + kerndeint_filter_deps="gpl" + ladspa_filter_deps="ladspa libdl" +@@ -6389,8 +6395,10 @@ enabled librdkafka && require_pkg_config librdkafka rdkafka "librdkafka/rdkafka + + enabled libjson_c && check_pkg_config libjson_c json-c json-c/json.h json_c_version + +-enabled libinference_engine_c_wrapper && +- require_pkg_config libinference_engine_c_wrapper dldt_c_api "ie_c_api.h" ie_c_api_version ++enabled python3 && require_pkg_config python3 python-3.6 Python.h Py_Initialize ++ ++enabled libinference_engine_c_api && ++ require libinference_engine_c_api c_api/ie_c_api.h ie_c_api_version -linference_engine_c_api + + if enabled gcrypt; then + GCRYPT_CONFIG="${cross_prefix}libgcrypt-config" +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index 027bd58e40..fac0018ce7 100755 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -1892,6 +1892,39 @@ static void print_report(int is_last_report, int64_t timer_start, int64_t cur_ti + total_fps = t > 1 ? 
total_frames_num / (t - init_time / 1000000.0 ): 0; + if (total_fps > 0) + av_bprintf(&buf, " fps without filter init=%.2f |", total_fps); ++ ++ av_bprintf(&buf, " latency(ms):"); ++ for (i = 0; i < nb_input_streams; i++) { ++ InputStream *ist = input_streams[i]; ++ if (ist->dec_ctx->codec_type == AVMEDIA_TYPE_VIDEO && ist->frames_decoded) { ++ av_bprintf(&buf, " dec_%s=%.2f ", ist->dec_ctx->codec->name, ++ (ist->dec_ctx->decode_latency / 1000.0) / ist->frames_decoded); ++ } ++ } ++ ++ for (i = 0; i < nb_filtergraphs; i++) { ++ FilterGraph *fg = filtergraphs[i]; ++ if (!fg || !fg->graph) ++ continue; ++ for (int j = 0; j < fg->graph->nb_filters; j++) { ++ AVFilterContext *ft = fg->graph->filters[j]; ++ if (!ft || !ft->outputs) ++ continue; ++ if (ft->outputs[0]->frame_count_in && strncmp(ft->name, "Parsed", 6) == 0) { ++ float lt = (ft->filter_latency / 1000.0) / (ft->outputs[0]->frame_count_in); ++ if (lt != 0) ++ av_bprintf(&buf, " %s=%.2f ", &ft->name[7], lt); ++ } ++ } ++ } ++ for (i = 0; i < nb_output_streams; i++) { ++ OutputStream *ost = output_streams[i]; ++ if (ost->enc_ctx->codec_type == AVMEDIA_TYPE_VIDEO && ost->enc_ctx->frame_number > 1) { ++ av_bprintf(&buf, "enc_%s=%.2f ", ost->enc_ctx->codec->name, ++ (ost->enc_ctx->encode_latency / 1000.0) / ost->enc_ctx->frame_number); ++ } ++ } ++ av_bprintf(&buf, "|"); + } + + secs = FFABS(pts) / AV_TIME_BASE; +@@ -2713,6 +2746,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + if (ist->st->sample_aspect_ratio.num) + decoded_frame->sample_aspect_ratio = ist->st->sample_aspect_ratio; + ++ if (do_profiling_all) ++ ist->dec_ctx->decode_latency += decoded_frame->tm_out - decoded_frame->tm_in; + err = send_frame_to_filters(ist, decoded_frame); + + fail: +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index d234271c5b..f0b36d77ef 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -3370,6 +3370,8 @@ typedef struct AVCodecContext { + * - encoding: unused + */ + int discard_damaged_percentage; ++ ++ uint64_t decode_latency, encode_latency; + } AVCodecContext; + + #if FF_API_CODEC_GET_SET +diff --git a/libavcodec/decode.c b/libavcodec/decode.c +index 6c31166ec2..3f155fcce0 100644 +--- a/libavcodec/decode.c ++++ b/libavcodec/decode.c +@@ -37,6 +37,7 @@ + #include "libavutil/internal.h" + #include "libavutil/intmath.h" + #include "libavutil/opt.h" ++#include "libavutil/time.h" + + #include "avcodec.h" + #include "bytestream.h" +@@ -407,6 +408,7 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame) + // copy to ensure we do not change pkt + int got_frame, actual_got_frame; + int ret; ++ uint64_t tm_start; + + if (!pkt->data && !avci->draining) { + av_packet_unref(pkt); +@@ -430,7 +432,13 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame) + if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME) { + ret = ff_thread_decode_frame(avctx, frame, &got_frame, pkt); + } else { ++ if (av_profiling_get()) ++ tm_start = av_gettime(); + ret = avctx->codec->decode(avctx, frame, &got_frame, pkt); ++ if (av_profiling_get() && got_frame && frame) { ++ frame->tm_in = tm_start; ++ frame->tm_out = av_gettime(); ++ } + + if (!(avctx->codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS)) + frame->pkt_dts = pkt->dts; +diff --git a/libavcodec/encode.c b/libavcodec/encode.c +index d12c42526b..4e81694d17 100644 +--- a/libavcodec/encode.c ++++ b/libavcodec/encode.c +@@ -24,6 +24,7 @@ + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" + 
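The latency figures reported above come from a simple wall-clock bracket around each decode call: decode.c stamps tm_in/tm_out on every decoded frame, and print_report() accumulates the deltas and divides by frames_decoded. A minimal sketch of that pattern, assuming hypothetical decode_one() and Frame stand-ins for the codec callback and AVFrame:

/* Wall-clock bracketing as applied in decode.c above; decode_one() and
 * Frame are hypothetical stand-ins for avctx->codec->decode() and AVFrame. */
#include <stdint.h>
#include "libavutil/time.h"           /* av_gettime(): microsecond wall clock */

typedef struct Frame { uint64_t tm_in, tm_out; } Frame;

extern int decode_one(Frame *frame);  /* hypothetical decoder callback */

static uint64_t decode_latency;       /* accumulated like dec_ctx->decode_latency */
static uint64_t frames_decoded;

static void timed_decode(Frame *frame)
{
    uint64_t tm_start = av_gettime();

    if (decode_one(frame)) {
        frame->tm_in  = tm_start;     /* decode of this frame began here  */
        frame->tm_out = av_gettime(); /* decoded frame became available   */
        decode_latency += frame->tm_out - frame->tm_in;
        frames_decoded++;
    }
}

At report time the average is (decode_latency / 1000.0) / frames_decoded, which is exactly the dec_%s=%.2f millisecond figure printed above.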
#include "libavutil/samplefmt.h" ++#include "libavutil/time.h" + + #include "avcodec.h" + #include "frame_thread_encoder.h" +@@ -263,6 +264,7 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx, + int ret; + AVPacket user_pkt = *avpkt; + int needs_realloc = !user_pkt.data; ++ uint64_t tm_start; + + *got_packet_ptr = 0; + +@@ -293,7 +295,11 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx, + + av_assert0(avctx->codec->encode2); + ++ if (av_profiling_get()) ++ tm_start = av_gettime(); + ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr); ++ if (av_profiling_get()) ++ avctx->encode_latency += av_gettime() - tm_start; + av_assert0(ret <= 0); + + emms_c(); +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 36ac0ac1e5..9919c5ac38 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -44,6 +44,7 @@ + #include "libavutil/mem.h" + #include "libavutil/opt.h" + #include "libavutil/thread.h" ++#include "libavutil/time.h" + + enum { + ///< Set when the thread is awaiting a packet. +@@ -170,6 +171,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg) + PerThreadContext *p = arg; + AVCodecContext *avctx = p->avctx; + const AVCodec *codec = avctx->codec; ++ uint64_t tm_start; + + pthread_mutex_lock(&p->mutex); + while (1) { +@@ -198,6 +200,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) + + av_frame_unref(p->frame); + p->got_frame = 0; ++ if (av_profiling_get()) ++ tm_start = av_gettime(); + p->result = codec->decode(avctx, p->frame, &p->got_frame, &p->avpkt); + + if ((p->result < 0 || !p->got_frame) && p->frame->buf[0]) { +@@ -226,6 +230,10 @@ static attribute_align_arg void *frame_worker_thread(void *arg) + atomic_store(&p->state, STATE_INPUT_READY); + + pthread_cond_broadcast(&p->progress_cond); ++ if (av_profiling_get()) { ++ p->frame->tm_in = tm_start; ++ p->frame->tm_out = av_gettime(); ++ } + pthread_cond_signal(&p->output_cond); + pthread_mutex_unlock(&p->progress_mutex); + } +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 0ce29b0c3a..5a1339302d 100755 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -29,22 +29,8 @@ OBJS-$(CONFIG_QSVVPP) += qsvvpp.o + DNN-OBJS-$(CONFIG_LIBTENSORFLOW) += dnn_backend_tf.o + OBJS-$(CONFIG_DNN) += dnn_interface.o dnn_backend_native.o $(DNN-OBJS-yes) + OBJS-$(CONFIG_SCENE_SAD) += scene_sad.o +-OBJS-$(CONFIG_IMAGE_INFERENCE) += inference_backend/ff_base_inference.o \ +- inference_backend/ff_inference_impl.o \ +- inference_backend/ff_list.o \ +- inference_backend/ff_proc_factory.o \ +- inference_backend/image.o \ +- inference_backend/image_inference.o \ +- inference_backend/image_inference_async_preproc.o \ +- inference_backend/logger.o \ +- inference_backend/model_proc.o \ +- inference_backend/openvino_image_inference.o \ +- inference_backend/pre_proc.o \ +- inference_backend/pre_proc_mocker.o \ +- inference_backend/pre_proc_swscale.o \ +- inference_backend/pre_proc_vaapi.o \ +- inference_backend/safe_queue.o \ +- inference_backend/metaconverter.o \ ++ ++include $(SRC_PATH)/libavfilter/inference_backend/Makefile + + # audio filters + OBJS-$(CONFIG_ABENCH_FILTER) += f_bench.o +@@ -287,8 +273,10 @@ OBJS-$(CONFIG_IDET_FILTER) += vf_idet.o + OBJS-$(CONFIG_IL_FILTER) += vf_il.o + OBJS-$(CONFIG_INFERENCE_IDENTIFY_FILTER) += vf_inference_identify.o + OBJS-$(CONFIG_INFERENCE_METACONVERT_FILTER) += vf_inference_metaconvert.o ++OBJS-$(CONFIG_INFERENCE_PYTHON_FILTER) += vf_inference_python.o + 
OBJS-$(CONFIG_INFERENCE_CLASSIFY_FILTER) += vf_inference_classify.o + OBJS-$(CONFIG_INFERENCE_DETECT_FILTER) += vf_inference_detect.o ++OBJS-$(CONFIG_INFERENCE_INFER_FILTER) += vf_inference_infer.o + OBJS-$(CONFIG_INFLATE_FILTER) += vf_neighbor.o + OBJS-$(CONFIG_INTERLACE_FILTER) += vf_tinterlace.o + OBJS-$(CONFIG_INTERLEAVE_FILTER) += f_interleave.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 31abaf39d2..e96e771729 100755 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -256,8 +256,10 @@ extern AVFilter ff_vf_idet; + extern AVFilter ff_vf_il; + extern AVFilter ff_vf_inference_identify; + extern AVFilter ff_vf_inference_metaconvert; ++extern AVFilter ff_vf_inference_python; + extern AVFilter ff_vf_inference_classify; + extern AVFilter ff_vf_inference_detect; ++extern AVFilter ff_vf_inference_infer; + extern AVFilter ff_vf_inflate; + extern AVFilter ff_vf_interlace; + extern AVFilter ff_vf_interleave; +diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +index 8d8a42c67c..a27489d08d 100755 +--- a/libavfilter/avfilter.c ++++ b/libavfilter/avfilter.c +@@ -1133,6 +1133,10 @@ int ff_filter_frame(AVFilterLink *link, AVFrame *frame) + av_frame_free(&frame); + return ret; + } ++ if (av_profiling_get()) { ++ if (frame->tm_in != 0 && ((frame->width && frame->height) || !frame->nb_samples)) ++ link->src->filter_latency += frame->tm_out - frame->tm_in; ++ } + ff_filter_set_ready(link->dst, 300); + return 0; + +diff --git a/libavfilter/avfilter.h b/libavfilter/avfilter.h +index 7545883367..7abb7c29d3 100755 +--- a/libavfilter/avfilter.h ++++ b/libavfilter/avfilter.h +@@ -423,7 +423,7 @@ struct AVFilterContext { + */ + int extra_hw_frames; + +- int64_t last_tm, init_working_time, sum_working_time; ++ uint64_t last_tm, init_working_time, sum_working_time, filter_latency; + }; + + /** +diff --git a/libavfilter/framequeue.c b/libavfilter/framequeue.c +index fed1118975..4b37d14cc9 100644 +--- a/libavfilter/framequeue.c ++++ b/libavfilter/framequeue.c +@@ -21,6 +21,7 @@ + + #include "libavutil/avassert.h" + #include "framequeue.h" ++#include "libavutil/time.h" + + static inline FFFrameBucket *bucket(FFFrameQueue *fq, size_t idx) + { +@@ -86,8 +87,11 @@ int ff_framequeue_add(FFFrameQueue *fq, AVFrame *frame) + fq->allocated = na; + } + } ++ if (av_profiling_get() && frame) ++ frame->tm_out = av_gettime(); + b = bucket(fq, fq->queued); + b->frame = frame; ++ + fq->queued++; + fq->total_frames_head++; + fq->total_samples_head += frame->nb_samples; +@@ -109,6 +113,8 @@ AVFrame *ff_framequeue_take(FFFrameQueue *fq) + fq->total_samples_tail += b->frame->nb_samples; + fq->samples_skipped = 0; + check_consistency(fq); ++ if (av_profiling_get()) ++ b->frame->tm_in = av_gettime(); + return b->frame; + } + +diff --git a/libavfilter/inference_backend/Makefile b/libavfilter/inference_backend/Makefile +new file mode 100644 +index 0000000000..66be537875 +--- /dev/null ++++ b/libavfilter/inference_backend/Makefile +@@ -0,0 +1,17 @@ ++OBJS-$(CONFIG_IMAGE_INFERENCE) += inference_backend/ff_base_inference.o \ ++ inference_backend/ff_inference_impl.o \ ++ inference_backend/ff_list.o \ ++ inference_backend/ff_proc_factory.o \ ++ inference_backend/image.o \ ++ inference_backend/image_inference.o \ ++ inference_backend/image_inference_async_preproc.o \ ++ inference_backend/logger.o \ ++ inference_backend/model_proc.o \ ++ inference_backend/openvino_image_inference.o \ ++ inference_backend/pre_proc.o \ ++ inference_backend/pre_proc_mocker.o \ ++ 
inference_backend/pre_proc_swscale.o \ ++ inference_backend/pre_proc_vaapi.o \ ++ inference_backend/safe_queue.o \ ++ inference_backend/metaconverter.o \ ++ inference_backend/wrap_image.o \ +diff --git a/libavfilter/inference_backend/ff_base_inference.h b/libavfilter/inference_backend/ff_base_inference.h +index c1cbf7e249..30101275eb 100644 +--- a/libavfilter/inference_backend/ff_base_inference.h ++++ b/libavfilter/inference_backend/ff_base_inference.h +@@ -61,7 +61,6 @@ typedef struct __ModelOutputPostproc ModelOutputPostproc; + + #define FF_INFERENCE_OPTIONS \ + char *model; \ +- char *object_class; \ + char *model_proc; \ + char *device; \ + int batch_size; \ +@@ -181,6 +180,15 @@ typedef struct _InferDetectionMeta { + BBoxesArray *bboxes; + } InferDetectionMeta; + ++typedef struct _TensorsArray { ++ IETensorMeta **tensors; ++ int num; ++} TensorsArray; ++ ++typedef struct _InferTensorMeta { ++ TensorsArray *t_array; ++} InferTensorMeta; ++ + typedef struct __InferenceROI { + AVFrame *frame; + FFVideoRegionOfInterestMeta roi; +diff --git a/libavfilter/inference_backend/ff_inference_impl.c b/libavfilter/inference_backend/ff_inference_impl.c +index 5a0e91808b..90b19dd8fa 100644 +--- a/libavfilter/inference_backend/ff_inference_impl.c ++++ b/libavfilter/inference_backend/ff_inference_impl.c +@@ -37,10 +37,8 @@ typedef enum { + + typedef struct __Model { + const char *name; +- char *object_class; + ImageInferenceContext *infer_ctx; + FFInferenceImpl *infer_impl; +- // std::map proc; + void *input_preproc; + + void *proc_config; +@@ -79,41 +77,6 @@ struct __FFInferenceImpl { + ff_list_t *processed_frames; // TODO: consider remove it if all output frames can be consumed instantly + }; + +-static void SplitString(char *str, const char *delim, char **array, int *num, int max) { +- char *p; +- int i = 0; +- +- if (!str || !delim || !array || !num) +- return; +- +- while (p = strtok(str, delim)) { +- int j = 0; +- char *s; +- size_t end; +- +- /* remove head blanks */ +- while (p[j] == '\n' || p[j] == ' ') +- j++; +- +- if (!p[j]) +- continue; +- +- /* remove tail blanks */ +- s = p + j; +- end = strlen(s) - 1; +- while (s[end] == '\n' || s[end] == ' ') +- s[end--] = '\0'; +- +- array[i++] = s; +- av_assert0(i < max); +- +- /* string is cached */ +- str = NULL; +- } +- +- *num = i; +-} +- + static inline int avFormatToFourCC(int format) { + switch (format) { + case AV_PIX_FMT_NV12: +@@ -259,19 +222,13 @@ static int ConfigPreProc(FFBaseInference *base, FFInferenceImpl *impl) { + + // Create async pre_proc image inference backend + if (base->param.opaque) { +- PreProcContext *preproc_ctx = NULL; + ImageInferenceContext *async_preproc_ctx = NULL; +- + const ImageInference *inference = image_inference_get_by_name("async_preproc"); + async_preproc_ctx = image_inference_alloc(inference, NULL, "async-preproc-infer"); +- if (base->param.vpp_device == VPP_DEVICE_HW) +- preproc_ctx = pre_proc_alloc(pre_proc_get_by_name("vaapi")); +- else +- preproc_ctx = pre_proc_alloc(pre_proc_get_by_name("mocker")); + +- av_assert0(async_preproc_ctx && preproc_ctx); ++ av_assert0(async_preproc_ctx); + +- async_preproc_ctx->inference->CreateAsyncPreproc(async_preproc_ctx, context, preproc_ctx, 6, ++ async_preproc_ctx->inference->CreateAsyncPreproc(async_preproc_ctx, context, 6, base->param.vpp_device, + base->param.opaque); + + // substitute for opevino image inference +@@ -284,8 +241,27 @@ static int ConfigPreProc(FFBaseInference *base, FFInferenceImpl *impl) { + return 0; + } + +-static Model 
*CreateModel(FFBaseInference *base, const char *model_file, const char *model_proc_path, +- const char *object_class) { ++static void *ParseModelProc(Model *model, const char *model_proc_path) { ++ void *proc = model_proc_read_config_file(model_proc_path); ++ if (!proc) { ++ VAII_LOGE("Could not read proc config file:" ++ "%s\n", ++ model_proc_path); ++ av_assert0(proc); ++ } ++ ++ if (model_proc_parse_input_preproc(proc, &model->model_preproc) < 0) { ++ VAII_WARNING("Parse input preproc error.\n"); ++ } ++ ++ if (model_proc_parse_output_postproc(proc, &model->model_postproc) < 0) { ++ VAII_WARNING("Parse output postproc error.\n"); ++ } ++ ++ return proc; ++} ++ ++static Model *CreateModel(FFBaseInference *base, const char *model_file, const char *model_proc_path) { + int ret = 0; + Model *model = NULL; + const ImageInference *inference = image_inference_get_by_name("openvino"); +@@ -300,23 +276,7 @@ static Model *CreateModel(FFBaseInference *base, const char *model_file, const c + av_assert0(context && model); + + if (model_proc_path) { +- void *proc = model_proc_read_config_file(model_proc_path); +- if (!proc) { +- VAII_LOGE("Could not read proc config file:" +- "%s\n", +- model_proc_path); +- av_assert0(proc); +- } +- +- if (model_proc_parse_input_preproc(proc, &model->model_preproc) < 0) { +- VAII_ERROR("Parse input preproc error.\n"); +- } +- +- if (model_proc_parse_output_postproc(proc, &model->model_postproc) < 0) { +- VAII_ERROR("Parse output postproc error.\n"); +- } +- +- model->proc_config = proc; ++ model->proc_config = ParseModelProc(model, model_proc_path); + } + + ret = context->inference->Create(context, MEM_TYPE_ANY, base->param.device, model_file, base->param.batch_size, +@@ -325,7 +285,6 @@ static Model *CreateModel(FFBaseInference *base, const char *model_file, const c + + model->infer_ctx = context; + model->name = context->inference->GetModelName(context); +- model->object_class = object_class ? 
av_strdup(object_class) : NULL; + model->input_preproc = NULL; + + return model; +@@ -342,8 +301,6 @@ static void ReleaseModel(Model *model) { + + model_proc_release_model_proc(model->proc_config, &model->model_preproc, &model->model_postproc); + +- if (model->object_class) +- av_free(model->object_class); + av_free(model); + } + +@@ -396,8 +353,8 @@ FFInferenceImpl *FFInferenceImplCreate(FFBaseInference *ff_base_inference) { + + av_assert0(impl && ff_base_inference && ff_base_inference->param.model); + +- dnn_model = CreateModel(ff_base_inference, ff_base_inference->param.model, ff_base_inference->param.model_proc, +- ff_base_inference->param.object_class); ++ dnn_model = CreateModel(ff_base_inference, ff_base_inference->param.model, ff_base_inference->param.model_proc); ++ + dnn_model->infer_impl = impl; + + impl->model = dnn_model; +@@ -414,9 +371,11 @@ FFInferenceImpl *FFInferenceImplCreate(FFBaseInference *ff_base_inference) { + } + + int FFInferenceImplSetParams(FFBaseInference *ff_base_inference) { +- av_assert0(ff_base_inference); +- FFInferenceImpl *impl = (FFInferenceImpl *)ff_base_inference->inference; +- av_assert0(impl); ++ FFInferenceImpl *impl; ++ ++ av_assert0(ff_base_inference && ff_base_inference->inference); ++ ++ impl = (FFInferenceImpl *)ff_base_inference->inference; + + // here currently mainly about preproc + ConfigPreProc(ff_base_inference, impl); +diff --git a/libavfilter/inference_backend/ff_proc_factory.c b/libavfilter/inference_backend/ff_proc_factory.c +index ded8a5bc88..2fd140c7b5 100755 +--- a/libavfilter/inference_backend/ff_proc_factory.c ++++ b/libavfilter/inference_backend/ff_proc_factory.c +@@ -30,6 +30,7 @@ struct _precision { + IEPrecision value; + const char *str; + }; ++ + static struct _precision precision_table[] = { + ENUM_STRING_PAIR(FP32), + ENUM_STRING_PAIR(U8), +@@ -39,6 +40,7 @@ struct _layout { + IELayout value; + const char *str; + }; ++ + static struct _layout layout_table[] = { + ENUM_STRING_PAIR(ANY), + ENUM_STRING_PAIR(NCHW), +@@ -103,6 +105,25 @@ static void infer_classify_metadata_buffer_free(void *opaque, uint8_t *data) { + av_free(data); + } + ++static void infer_tensor_metadata_buffer_free(void *opaque, uint8_t *data) { ++ TensorsArray *t_array = ((InferTensorMeta *)data)->t_array; ++ ++ if (t_array) { ++ int i; ++ for (i = 0; i < t_array->num; i++) { ++ IETensorMeta *p = t_array->tensors[i]; ++ av_buffer_unref(&p->buffer); ++ if (p->layer_name) ++ av_freep(&p->layer_name); ++ av_freep(&p); ++ } ++ av_free(t_array->tensors); ++ av_freep(&t_array); ++ } ++ ++ av_free(data); ++} ++ + static inline void enhanced_face_bounding_box(FFVideoRegionOfInterestMeta *roi) { + const float bb_enlarge_coefficient = 1.2; + const float bb_dx_coefficient = 1.0; +@@ -133,8 +154,9 @@ static void inline fill_tensor_metadata(IETensorMeta *tensor, const char *layer_ + tensor->precision = get_precision_string(precision); + tensor->layout = get_layout_string(layout); + tensor->ranks = ranks; +- for (int i = 0; i < ranks; i++) ++ for (int i = 0; i < ranks; i++) { + tensor->dims[i] = dims[i]; ++ } + tensor->layer_name = strdup(layer_name); + tensor->model_name = model_name; + if (data) { +@@ -407,9 +429,9 @@ static void ExtractYOLOV3BoundingBoxes(const OutputBlobArray *blob_array, Infere + av_free(obj_array.objects); + } + +-static void ExtractBoundingBoxes(const OutputBlobArray *blob_array, InferenceROIArray *infer_roi_array, +- ModelOutputPostproc *model_postproc, const char *model_name, +- const FFBaseInference *ff_base_inference) { ++static void 
ExtractSSDBoundingBoxes(const OutputBlobArray *blob_array, InferenceROIArray *infer_roi_array, ++ ModelOutputPostproc *model_postproc, const char *model_name, ++ const FFBaseInference *ff_base_inference) { + for (int n = 0; n < blob_array->num_blobs; n++) { + AVBufferRef *labels = NULL; + BBoxesArray **boxes = NULL; +@@ -689,9 +711,9 @@ static int tensor_to_text(FFVideoRegionOfInterestMeta *meta, OutputPostproc *pos + return 0; + } + +-static void Blob2RoiMeta(const OutputBlobArray *blob_array, InferenceROIArray *infer_roi_array, +- ModelOutputPostproc *model_postproc, const char *model_name, +- const FFBaseInference *ff_base_inference) { ++static void ExtractClassifyResults(const OutputBlobArray *blob_array, InferenceROIArray *infer_roi_array, ++ ModelOutputPostproc *model_postproc, const char *model_name, ++ const FFBaseInference *ff_base_inference) { + int batch_size = infer_roi_array->num_infer_ROIs; + + for (int n = 0; n < blob_array->num_blobs; n++) { +@@ -772,6 +794,63 @@ static void Blob2RoiMeta(const OutputBlobArray *blob_array, InferenceROIArray *i + } + } + ++static void ExtractInferResults(const OutputBlobArray *blob_array, InferenceROIArray *infer_roi_array, ++ ModelOutputPostproc *model_postproc, const char *model_name, ++ const FFBaseInference *ff_base_inference) { ++ int batch_size = infer_roi_array->num_infer_ROIs; ++ ++ for (int n = 0; n < blob_array->num_blobs; n++) { ++ OutputBlobContext *ctx = blob_array->output_blobs[n]; ++ const OutputBlobMethod *blob = ctx->output_blob_method; ++ ++ const char *layer_name = blob->GetOutputLayerName(ctx); ++ const uint8_t *data = (const uint8_t *)blob->GetData(ctx); ++ ++ Dimensions dim = blob->GetDims(ctx); ++ IILayout layout = blob->GetLayout(ctx); ++ IEPrecision precision = blob->GetPrecision(ctx); ++ ++ int size = get_unbatched_size_in_bytes(ctx, batch_size); ++ ++ for (int b = 0; b < batch_size; b++) { ++ AVBufferRef *ref; ++ AVFrame *av_frame = infer_roi_array->infer_ROIs[b]->frame; ++ AVFrameSideData *sd = NULL; ++ ++ InferTensorMeta *infer_meta = NULL; ++ TensorsArray *infer_array = NULL; ++ IETensorMeta *new_infer = NULL; ++ ++ infer_array = (TensorsArray *)av_mallocz(sizeof(*infer_array)); ++ infer_meta = (InferTensorMeta *)av_malloc(sizeof(*infer_meta)); ++ av_assert0(infer_meta && infer_array); ++ infer_meta->t_array = infer_array; ++ ++ new_infer = (IETensorMeta *)av_mallocz(sizeof(*new_infer)); ++ av_assert0(new_infer); ++ ++ fill_tensor_metadata(new_infer, layer_name, model_name, precision, layout, dim.num_dims, dim.dims, ++ (void *)(data + b * size), size); ++ ++ av_dynarray_add(&infer_meta->t_array->tensors, &infer_meta->t_array->num, new_infer); ++ ++ ref = av_buffer_create((uint8_t *)infer_meta, sizeof(*infer_meta), &infer_tensor_metadata_buffer_free, NULL, ++ 0); ++ if (ref == NULL) { ++ infer_tensor_metadata_buffer_free(NULL, (uint8_t *)infer_meta); ++ av_assert0(ref); ++ } ++ // add meta data to side data ++ sd = av_frame_new_side_data_from_buf(av_frame, AV_FRAME_DATA_INFERENCE_INFER, ref); ++ if (sd == NULL) { ++ av_buffer_unref(&ref); ++ av_assert0(sd); ++ } ++ VAII_LOGD("av_frame:%p sd:%d\n", av_frame, av_frame->nb_side_data); ++ } ++ } ++} ++ + PostProcFunction getPostProcFunctionByName(const char *name, const char *model) { + if (name == NULL || model == NULL) + return NULL; +@@ -780,9 +859,11 @@ PostProcFunction getPostProcFunctionByName(const char *name, const char *model) + if (strstr(model, "yolo")) + return (PostProcFunction)ExtractYOLOV3BoundingBoxes; + else +- return 
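ExtractInferResults above leans on a standard libavutil ownership pattern: the metadata struct is wrapped in an AVBufferRef whose custom free callback tears down the tensor array, and the frame side data then owns that reference. Reduced to its essentials, with a hypothetical MyMeta payload in place of InferTensorMeta, and using the AV_FRAME_DATA_INFERENCE_INFER side-data type this patch introduces:

/* Side-data lifecycle pattern used by ExtractInferResults; MyMeta is a
 * hypothetical payload standing in for InferTensorMeta. */
#include "libavutil/buffer.h"
#include "libavutil/error.h"
#include "libavutil/frame.h"
#include "libavutil/mem.h"

typedef struct MyMeta { int example_field; } MyMeta;

static void my_meta_free(void *opaque, uint8_t *data)
{
    av_free(data);                    /* runs when the last reference drops */
}

static int attach_meta(AVFrame *frame)
{
    MyMeta *meta = av_mallocz(sizeof(*meta));
    AVBufferRef *ref = meta ? av_buffer_create((uint8_t *)meta, sizeof(*meta),
                                               my_meta_free, NULL, 0) : NULL;
    if (!ref) {
        av_free(meta);
        return AVERROR(ENOMEM);
    }
    if (!av_frame_new_side_data_from_buf(frame, AV_FRAME_DATA_INFERENCE_INFER, ref)) {
        av_buffer_unref(&ref);        /* frame did not take ownership */
        return AVERROR(ENOMEM);
    }
    return 0;                         /* frame now owns the metadata */
}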
(PostProcFunction)ExtractBoundingBoxes; ++ return (PostProcFunction)ExtractSSDBoundingBoxes; + } else if (!strcmp(name, "classify")) { +- return (PostProcFunction)Blob2RoiMeta; ++ return (PostProcFunction)ExtractClassifyResults; ++ } else if (!strcmp(name, "infer")) { ++ return (PostProcFunction)ExtractInferResults; + } + return NULL; + } +diff --git a/libavfilter/inference_backend/image_inference.h b/libavfilter/inference_backend/image_inference.h +index 5be91e6498..9a81b9e021 100644 +--- a/libavfilter/inference_backend/image_inference.h ++++ b/libavfilter/inference_backend/image_inference.h +@@ -75,7 +75,7 @@ struct ImageInference { + + /* create image inference engine w/ asynchronous input preprocessing */ + int (*CreateAsyncPreproc)(ImageInferenceContext *async_preproc_context, ImageInferenceContext *inference_context, +- PreProcContext *preproc_context, int image_queue_size, void *opaque); ++ int image_queue_size, int, void *opaque); + + /* submit image */ + void (*SubmitImage)(ImageInferenceContext *ctx, const Image *image, IFramePtr user_data, +diff --git a/libavfilter/inference_backend/image_inference_async_preproc.c b/libavfilter/inference_backend/image_inference_async_preproc.c +index 5ab87cea40..e564214e97 100644 +--- a/libavfilter/inference_backend/image_inference_async_preproc.c ++++ b/libavfilter/inference_backend/image_inference_async_preproc.c +@@ -43,17 +43,25 @@ static void PreprocImagesFree(PreprocImage **imgs, size_t num_imgs) { + } + + static int ImageInferenceAsyncPreprocCreate(ImageInferenceContext *async_preproc_context, +- ImageInferenceContext *inference_context, PreProcContext *preproc_context, +- int image_queue_size, void *opaque) { ++ ImageInferenceContext *inference_context, int image_queue_size, ++ int vpp_device_type, void *opaque) { + int ret = 0; + int width = 0, height = 0, format = 0; + ImageInferenceAsyncPreproc *async_preproc = (ImageInferenceAsyncPreproc *)async_preproc_context->priv; +- PreProcInitParam pp_init_param = {}; +- assert(inference_context && preproc_context); ++ PreProcInitParam pp_init_param = {0}; ++ PreProcContext *preproc_context; ++ ++ assert(inference_context); + + VAII_INFO("Using async preproc image inference."); + + async_preproc->actual = inference_context; ++ ++ preproc_context = ++ (vpp_device_type == 1 /* VPP_DEVICE_HW */) ? 
CreatePreProcessor("vaapi") : CreatePreProcessor("mocker"); ++ ++ assert(preproc_context); ++ + async_preproc->pre_proc = preproc_context; + + // TODO: create image pool +@@ -194,7 +202,7 @@ static void ImageInferenceAsyncPreprocClose(ImageInferenceContext *ctx) { + infer->Close(infer_ctx); + image_inference_free(infer_ctx); + pp_ctx->pre_proc->Destroy(pp_ctx); +- pre_proc_free(pp_ctx); ++ ReleasePreProcessor(pp_ctx); + + PreprocImagesFree(async_preproc->preproc_images, async_preproc->num_preproc_images); + +diff --git a/libavfilter/inference_backend/metaconverter.h b/libavfilter/inference_backend/metaconverter.h +index 3370b83b17..d5e6b2964b 100644 +--- a/libavfilter/inference_backend/metaconverter.h ++++ b/libavfilter/inference_backend/metaconverter.h +@@ -20,10 +20,10 @@ + + #pragma once + ++#include "ff_base_inference.h" + #include "libavfilter/avfilter.h" + #include + #include +-#include "ff_base_inference.h" + + typedef enum { + FFVA_METACONVERT_TENSOR2TEXT, +diff --git a/libavfilter/inference_backend/model_proc.c b/libavfilter/inference_backend/model_proc.c +index 88490f46c3..5f1604975f 100644 +--- a/libavfilter/inference_backend/model_proc.c ++++ b/libavfilter/inference_backend/model_proc.c +@@ -107,23 +107,6 @@ end: + return proc_config; + } + +-void model_proc_load_default_config_file(ModelInputPreproc *preproc, ModelOutputPostproc *postproc) { +- if (preproc) { +- /* +- * format is a little tricky, an ideal input format for IE is BGR planer +- * however, neither soft csc nor hardware vpp could support that format. +- * Here, we set a close soft format. The actual one coverted before sent +- * to IE will be decided by user config and hardware vpp used or not. +- */ +- preproc->color_format = AV_PIX_FMT_BGR24; +- preproc->layer_name = NULL; +- } +- +- if (postproc) { +- // do nothing +- } +-} +- + int model_proc_parse_input_preproc(const void *json, ModelInputPreproc *m_preproc) { + json_object *jvalue, *preproc, *color, *layer, *object_class; + int ret; +@@ -272,4 +255,4 @@ void model_proc_release_model_proc(const void *json, ModelInputPreproc *preproc, + } + + json_object_put((json_object *)json); +-} +\ No newline at end of file ++} +diff --git a/libavfilter/inference_backend/model_proc.h b/libavfilter/inference_backend/model_proc.h +index e4289d45e5..72d506a0d8 100644 +--- a/libavfilter/inference_backend/model_proc.h ++++ b/libavfilter/inference_backend/model_proc.h +@@ -24,8 +24,6 @@ + + void *model_proc_read_config_file(const char *path); + +-void model_proc_load_default_config_file(ModelInputPreproc *preproc, ModelOutputPostproc *postproc); +- + int model_proc_parse_input_preproc(const void *json, ModelInputPreproc *m_preproc); + + int model_proc_parse_output_postproc(const void *json, ModelOutputPostproc *m_postproc); +@@ -34,4 +32,4 @@ void model_proc_release_model_proc(const void *json, ModelInputPreproc *preproc, + + int model_proc_get_file_size(FILE *fp); + +-void infer_labels_buffer_free(void *opaque, uint8_t *data); +\ No newline at end of file ++void infer_labels_buffer_free(void *opaque, uint8_t *data); +diff --git a/libavfilter/inference_backend/openvino_image_inference.c b/libavfilter/inference_backend/openvino_image_inference.c +index e0bdd2ee58..af979d7cf0 100644 +--- a/libavfilter/inference_backend/openvino_image_inference.c ++++ b/libavfilter/inference_backend/openvino_image_inference.c +@@ -24,13 +24,12 @@ + #include "image_inference.h" + #include "logger.h" + #include "openvino_image_inference.h" ++#include "wrap_image.h" + + #define II_MAX(a, b) ((a) > (b) 
? (a) : (b)) + #define II_MIN(a, b) ((a) > (b) ? (b) : (a)) + +-typedef enum { VPP_DEVICE_HW, VPP_DEVICE_SW } DEVICE_TYPE; +- +-static inline void* mallocz(size_t size) { ++static inline void *mallocz(size_t size) { + void *ptr = malloc(size); + if (ptr) + memset(ptr, 0, size); +@@ -38,12 +37,13 @@ static inline void* mallocz(size_t size) { + } + + static ie_config_t *StringToIEConfig(const char *configs, char **pre_processor_name, char **multi_device_list, +- char **hetero_device_list, char**image_format) { ++ char **hetero_device_list, char **image_format) { + + ie_config_t *config_res = NULL, *cfg_tmp = NULL; + char *key = NULL, *value = NULL, *configs_temp = NULL; + +- if (!configs) return NULL; ++ if (!configs) ++ return NULL; + + configs_temp = (char *)mallocz(strlen(configs) + 1); + assert(configs_temp); +@@ -104,11 +104,11 @@ static ie_config_t *StringToIEConfig(const char *configs, char **pre_processor_n + return config_res; + } + +-static void ie_config_free(ie_config_t *config) { ++static void FreeIEConfigs(ie_config_t *config) { + while (config) { + ie_config_t *_tmp = config; + config = _tmp->next; +- free((char *)_tmp->name), ++ free((char *)_tmp->name); + free((char *)_tmp->value); + _tmp->name = NULL, _tmp->value = NULL, _tmp->next = NULL; + free(_tmp); +@@ -118,24 +118,15 @@ static void ie_config_free(ie_config_t *config) { + + static void completion_callback(void *args); + +-static inline int getNumberChannels(int format) { +- switch (format) { +- case FOURCC_BGRA: +- case FOURCC_BGRX: +- case FOURCC_RGBA: +- case FOURCC_RGBX: +- return 4; +- case FOURCC_BGR: +- return 3; +- } +- return 0; +-} +- + static colorformat_e FormatNameToIEColorFormat(const char *format) { +- static const char *formats[] = {"NV12", "RGB", "BGR", "RGBX", "BGRX", "RGBA", "BGRA"}; +- const colorformat_e ie_color_formats[] = {NV12, RGB, BGR, RGBX, BGRX, RGBX, BGRX}; ++ static const char *formats[] = {"NV12", "I420", "RGB", "BGR", "RGBX", "BGRX", "RGBA", "BGRA"}; ++ const colorformat_e ie_color_formats[] = {NV12, I420, RGB, BGR, RGBX, BGRX, RGBX, BGRX}; ++ int num_formats; ++ ++ if (!format) ++ return RAW; + +- int num_formats = sizeof(formats) / sizeof(formats[0]); ++ num_formats = sizeof(formats) / sizeof(formats[0]); + for (int i = 0; i < num_formats; i++) { + if (!strcmp(format, formats[i])) + return ie_color_formats[i]; +@@ -145,12 +136,35 @@ static colorformat_e FormatNameToIEColorFormat(const char *format) { + return RAW; + } + +-static inline void RectToIERoi(roi_t *roi, const Rectangle *rect) { +- roi->id = 0; +- roi->posX = rect->x; +- roi->posY = rect->y; +- roi->sizeX = rect->width; +- roi->sizeY = rect->height; ++static void SubmitExtraInputBlob(OpenVINOImageInference *vino, const BatchRequest *request, Image *image) { ++ ie_blob_t *input_blob = NULL; ++ dimensions_t blob_dims = {}; ++ ie_blob_buffer_t blob_buffer; ++ float *blob_data = NULL; ++ ++ ie_infer_request_get_blob(request->infer_request, vino->input_name_imginfo, &input_blob); ++ ie_blob_get_dims(input_blob, &blob_dims); ++ ++ // Fill input tensor with values ++ ie_blob_get_buffer(input_blob, &blob_buffer); ++ blob_data = (float *)(blob_buffer.buffer); ++ ++ if (!strcmp(vino->input_name_imginfo, "seq_ind")) { ++ int maxSequenceSizePerPlate = blob_dims.dims[0]; ++ blob_data[0] = 0.0f; ++ for (int n = 1; n < maxSequenceSizePerPlate; n++) ++ blob_data[n] = 1.0f; ++ } else if (!strcmp(vino->input_name_imginfo, "im_info")) { ++ for (int i = 0; i < vino->batch_size; i++) { ++ blob_data[i * blob_dims.dims[1] + 0] = 
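For two-input Faster R-CNN graphs, the second input ("im_info") describes the image each batch entry was resized from: one row of [height, width, scale factors...] per image, which SubmitExtraInputBlob fills as shown around this hunk. A standalone sketch of that fill, using only the OpenVINO C API calls this patch already relies on:

/* Populating a Faster R-CNN "im_info" blob as SubmitExtraInputBlob does;
 * one [H, W, scale...] row per batched image. */
#include <c_api/ie_c_api.h>

static void fill_im_info(ie_infer_request_t *req, const char *name,
                         int img_h, int img_w, int batch_size)
{
    ie_blob_t *blob = NULL;
    dimensions_t dims = {0};
    ie_blob_buffer_t buf;
    float *data;

    ie_infer_request_get_blob(req, name, &blob);
    ie_blob_get_dims(blob, &dims);              /* dims.dims[1] is 3 or 6 */
    ie_blob_get_buffer(blob, &buf);
    data = (float *)buf.buffer;

    for (int i = 0; i < batch_size; i++) {
        data[i * dims.dims[1] + 0] = (float)img_h;
        data[i * dims.dims[1] + 1] = (float)img_w;
        for (size_t k = 2; k < dims.dims[1]; k++)
            data[i * dims.dims[1] + k] = 1.0f;  /* scale factors left at 1.0 */
    }
    ie_blob_free(&blob);
}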
(float)image->height; ++ blob_data[i * blob_dims.dims[1] + 1] = (float)image->width; ++ ++ for (int k = 2; k < blob_dims.dims[1]; k++) { ++ blob_data[i * blob_dims.dims[1] + k] = 1.0f; // all scale factors are set to 1.0 ++ } ++ } ++ } ++ ie_blob_free(&input_blob); + } + + static void GetNextImageBuffer(ImageInferenceContext *ctx, const BatchRequest *request, Image *image) { +@@ -179,7 +193,8 @@ static void GetNextImageBuffer(ImageInferenceContext *ctx, const BatchRequest *r + image->stride[0] = image->width; + image->stride[1] = image->width; + image->stride[2] = image->width; +- ie_blob_destroy(&input_blob); ++ ++ ie_blob_free(&input_blob); + } + + static inline Image ApplyCrop(const Image *src) { +@@ -251,45 +266,18 @@ static void SubmitImagePreProcess(ImageInferenceContext *ctx, const BatchRequest + PreProcessor preProcessor) { + OpenVINOImageInference *vino = (OpenVINOImageInference *)ctx->priv; + +- if (vino->resize_by_inference) { +- ++ if (!vino->pre_processor) { ++ ie_blob_t *blob_ptr = NULL; + // ie preprocess can only support system memory right now + assert(pSrc->type == MEM_TYPE_SYSTEM); +- if (pSrc->format != FOURCC_NV12) { +- roi_t roi, *_roi = NULL; +- ie_blob_t *input_blob = NULL; +- tensor_desc_t tensor = {NHWC, {4, {1, getNumberChannels(pSrc->format), pSrc->height, pSrc->width}}, U8}; +- if (pSrc->rect.width != 0 && pSrc->rect.height != 0) { +- RectToIERoi(&roi, &pSrc->rect); +- _roi = &roi; +- } + +- ie_blob_make_memory_from_preallocated(&tensor, pSrc->planes[0], 0, &input_blob); +- if (_roi) { +- ie_blob_t *input_blob_roi = NULL; +- ie_blob_make_memory_with_roi(input_blob, _roi, &input_blob_roi); +- ie_infer_request_set_blob(request->infer_request, vino->input_name, input_blob_roi); +- ie_blob_destroy(&input_blob_roi); +- } else { +- ie_infer_request_set_blob(request->infer_request, vino->input_name, input_blob); +- ie_blob_destroy(&input_blob); +- } +- } else { +- Image src = {}; +- src = ApplyCrop(pSrc); +- +- ie_blob_t *y_blob = NULL, *uv_blob = NULL, *nv12_blob = NULL; +- tensor_desc_t y_tensor = {NHWC, {4, {1, 1, src.height - src.height % 2, src.width - src.width % 2}}, U8}; +- tensor_desc_t uv_tensor = {NHWC, {4, {1, 2, src.height / 2, src.width / 2}}, U8}; +- ie_blob_make_memory_from_preallocated(&y_tensor, src.planes[0], 0, &y_blob); +- ie_blob_make_memory_from_preallocated(&uv_tensor, src.planes[1], 0, &uv_blob); +- ie_blob_make_memory_nv12(y_blob, uv_blob, &nv12_blob); +- +- ie_infer_request_set_blob(request->infer_request, vino->input_name, nv12_blob); +- ie_blob_destroy(&y_blob); +- ie_blob_destroy(&uv_blob); +- ie_blob_destroy(&nv12_blob); +- } ++ blob_ptr = WrapImageToBlob(pSrc); ++ ++ assert(blob_ptr); ++ ++ ie_infer_request_set_blob(request->infer_request, vino->input_name, blob_ptr); ++ ++ ie_blob_free(&blob_ptr); + } else { + Image src = {}; + Image dst = {}; +@@ -297,20 +285,17 @@ static void SubmitImagePreProcess(ImageInferenceContext *ctx, const BatchRequest + dst.type = pSrc->type; + GetNextImageBuffer(ctx, request, &dst); + ++ if (vino->input_name_imginfo != NULL) { ++ SubmitExtraInputBlob(vino, request, &dst); ++ } ++ + if (pSrc->planes[0] != dst.planes[0]) { // only convert if different buffers +- if (!vino->vpp_ctx) { +- vino->vpp_ctx = pre_proc_alloc(pre_proc_get_by_type(MEM_TYPE_SYSTEM)); +- assert(vino->vpp_ctx); +- } +-#ifdef HAVE_GAPI +- vino->vpp_ctx->pre_proc->Convert(vino->vpp_ctx, &src, &dst, 0); +-#else + if (pSrc->type == MEM_TYPE_SYSTEM) + src = ApplyCrop(pSrc); + else + src = *pSrc; +- 
vino->vpp_ctx->pre_proc->Convert(vino->vpp_ctx, &src, &dst, 0);
+-#endif
++ vino->pre_processor->pre_proc->Convert(vino->pre_processor, &src, &dst, 0);
++
+ // model specific pre-processing
+ if (preProcessor)
+ preProcessor(&dst);
+@@ -318,6 +303,25 @@ static void SubmitImagePreProcess(ImageInferenceContext *ctx, const BatchRequest
+ }
+ }
+
++static char *CreateDeviceList(const char *devices, char *multi_devices, char *hetero_devices) {
++ char *_devices = NULL;
++
++ char *device_list =
++ (!strcmp(devices, "MULTI")) ? multi_devices : (!strcmp(devices, "HETERO") ? hetero_devices : NULL);
++
++ if (device_list) {
++ _devices = (char *)malloc(strlen(devices) + strlen(device_list) + 2);
++ assert(_devices);
++
++ memset(_devices, 0, sizeof(*_devices));
++ strcpy(_devices, devices);
++ strcat(_devices, ":");
++ strcat(_devices, device_list);
++ }
++
++ return _devices;
++}
++
+ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType type, const char *devices,
+ const char *model, int batch_size, int nireq, const char *configs,
+ void *allocator, CallbackFunc callback) {
+@@ -346,49 +350,12 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ return -1;
+ }
+
+- if (configs) {
+- ie_config_t *_configs = StringToIEConfig(configs, &pre_processor_name, &multi_device_list,
+- &hetero_device_list, &image_format);
++ if (configs && strlen(configs) > 0) {
++ ie_config_t *_configs =
++ StringToIEConfig(configs, &pre_processor_name, &multi_device_list, &hetero_device_list, &image_format);
+ ie_core_set_config(vino->core, _configs, devices);
+- vino->resize_by_inference = (pre_processor_name && !strcmp(pre_processor_name, "ie")) ? 1 : 0;
+-
+- if (!strcmp(devices, "MULTI")) {
+- if (multi_device_list) {
+- _devices = (char *)malloc(strlen(devices) + strlen(multi_device_list) + 2);
+- if (!_devices) {
+- VAII_ERROR("Not enough memory!");
+- ie_config_free(_configs);
+- goto err;
+- }
+- memset(_devices, 0, sizeof(*_devices));
+- strcpy(_devices, devices);
+- strcat(_devices, ":");
+- strcat(_devices, multi_device_list);
+- }
+- } else if (!strcmp(devices, "HETERO")) {
+- if (hetero_device_list) {
+- _devices = (char *)malloc(strlen(devices) + strlen(hetero_device_list) + 2);
+- if (!_devices) {
+- VAII_ERROR("Not enough memory!");
+- ie_config_free(_configs);
+- goto err;
+- }
+- memset(_devices, 0, sizeof(*_devices));
+- strcpy(_devices, devices);
+- strcat(_devices, ":");
+- strcat(_devices, hetero_device_list);
+- }
+- }
+-
+- ie_config_free(_configs);
+-
+- if (pre_processor_name)
+- free(pre_processor_name);
+- if (hetero_device_list)
+- free(hetero_device_list);
+- if (multi_device_list)
+- free(multi_device_list);
+- pre_processor_name = NULL, hetero_device_list = NULL, multi_device_list = NULL;
++ _devices = CreateDeviceList(devices, multi_device_list, hetero_device_list);
++ FreeIEConfigs(_configs);
+ }
+
+ // Read network
+@@ -407,6 +374,39 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ goto err;
+ }
+
++ if (input_num > 2) {
++ VAII_ERROR("Network should have 1 or 2 inputs!");
++ goto err;
++ }
++
++ ie_network_get_input_name(vino->network, 0, &vino->input_name);
++ if (!vino->input_name) {
++ VAII_ERROR("Get network input name failed!");
++ goto err;
++ }
++ ie_network_set_input_precision(vino->network, vino->input_name, U8);
++
++ // Some models have 2 inputs: Faster-RCNN and LPR converted from Caffe
++ // Now all LPR models we use are converted from Caffe
++ if (input_num == 2) {
++ ie_network_get_input_name(vino->network, 1, &vino->input_name_imginfo);
++ if (!vino->input_name_imginfo) {
++ VAII_ERROR("Get network input name failed!");
++ goto err;
++ }
++
++ if (!strcmp(vino->input_name_imginfo, "im_info")) {
++ dimensions_t input_dims = {};
++
++ ie_network_get_input_dims(vino->network, vino->input_name_imginfo, &input_dims);
++ ie_network_set_input_precision(vino->network, vino->input_name_imginfo, FP32);
++ if (input_dims.dims[1] != 3 && input_dims.dims[1] != 6) {
++ VAII_ERROR("Invalid im_info input. It should contain 3 or 6 values\n");
++ goto err;
++ }
++ }
++ }
++
+ ie_network_get_input_shapes(vino->network, &network_input_shapes);
+ if (batch_size > 1 && network_input_shapes.shapes) {
+ for (int i = 0; i < network_input_shapes.shape_num; i++)
+@@ -416,24 +416,18 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ ie_network_input_shapes_free(&network_input_shapes);
+ network_input_shapes.shape_num = 0;
+
+- ie_network_get_input_name(vino->network, 0, &vino->input_name);
+- if (!vino->input_name) {
+- VAII_ERROR("Get network input name failed!");
+- goto err;
+- }
+-
+- ie_network_set_input_precision(vino->network, vino->input_name, U8);
+ ie_network_set_input_layout(vino->network, vino->input_name, NCHW);
+
+- if (image_format) {
+- vino->ie_color_format = FormatNameToIEColorFormat(image_format);
+- ie_network_set_color_format(vino->network, vino->input_name, vino->ie_color_format);
+- free(image_format);
+- image_format = NULL;
+- }
++ if (pre_processor_name && !strcmp(pre_processor_name, "ie")) {
++ if (batch_size > 1) {
++ VAII_ERROR("IE pre processing doesn't support batch mode yet!");
++ goto err;
++ }
+
+- if (vino->resize_by_inference) {
+ ie_network_set_input_resize_algorithm(vino->network, vino->input_name, RESIZE_BILINEAR);
++ ie_network_set_color_format(vino->network, vino->input_name, FormatNameToIEColorFormat(image_format));
++ } else {
++ vino->pre_processor = CreatePreProcessor(pre_processor_name);
+ }
+
+ // Load network
+@@ -445,6 +439,7 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ VAII_ERROR("Creat executable network failed!");
+ goto err;
+ }
++
+ if (_devices)
+ free(_devices);
+
+@@ -460,7 +455,7 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ goto err;
+ }
+ vino->num_reqs = nireq;
+- for (size_t i = 0 ; i < vino->num_reqs; ++i) {
++ for (size_t i = 0; i < vino->num_reqs; ++i) {
+ ie_exec_network_create_infer_request(vino->exe_network, &vino->infer_requests[i]);
+ if (!vino->infer_requests[i]) {
+ VAII_ERROR("Creat infer requests failed!");
+@@ -505,7 +500,17 @@ static int OpenVINOImageInferenceCreate(ImageInferenceContext *ctx, MemoryType t
+ pthread_mutex_init(&vino->count_mutex, NULL);
+ pthread_cond_init(&vino->request_processed, NULL);
+
++ if (pre_processor_name)
++ free(pre_processor_name);
++ if (hetero_device_list)
++ free(hetero_device_list);
++ if (multi_device_list)
++ free(multi_device_list);
++ if (image_format)
++ free(image_format);
++
+ return 0;
++
+ err:
+ if (pre_processor_name)
+ free(pre_processor_name);
+@@ -537,7 +542,7 @@ err:
+ if (vino->freeRequests)
+ SafeQueueDestroy(vino->freeRequests);
+ if (vino->exe_network)
+- ie_exec_network_free (&vino->exe_network);
++ ie_exec_network_free(&vino->exe_network);
+ if (vino->network)
+ ie_network_free(&vino->network);
+ if (vino->core)
+@@ -546,7 +551,7 @@ err:
+ }
+
+ static void OpenVINOImageInferenceSubmtImage(ImageInferenceContext *ctx, const Image *image,
IFramePtr user_data, +- PreProcessor pre_processor) { ++ PreProcessor preproc_func) { + OpenVINOImageInference *vino = (OpenVINOImageInference *)ctx->priv; + const Image *pSrc = image; + BatchRequest *request = NULL; +@@ -559,39 +564,12 @@ static void OpenVINOImageInferenceSubmtImage(ImageInferenceContext *ctx, const I + + request = (BatchRequest *)SafeQueuePop(vino->freeRequests); + +- SubmitImagePreProcess(ctx, request, pSrc, pre_processor); ++ SubmitImagePreProcess(ctx, request, pSrc, preproc_func); + + image_inference_dynarray_add(&request->buffers.frames, &request->buffers.num_buffers, user_data); + + // start inference asynchronously if enough buffers for batching + if (request->buffers.num_buffers >= vino->batch_size) { +-#if 1 // TODO: remove when license-plate-recognition-barrier model will take one input +- size_t num_inputs; +- ie_network_get_inputs_number(vino->network, &num_inputs); +- if (num_inputs > 1) { +- char *input_name = NULL; +- ie_network_get_input_name(vino->network, 1, &input_name); +- if (!strcmp(input_name, "seq_ind")) { +- // 'seq_ind' input layer is some relic from the training +- // it should have the leading 0.0f and rest 1.0f +- dimensions_t dims = {}; +- float *blob_data; +- int maxSequenceSizePerPlate; +- ie_blob_t *input_blob = NULL; +- ie_blob_buffer_t blob_buffer; +- ie_infer_request_get_blob(request->infer_request, input_name, &input_blob); +- ie_blob_get_dims(input_blob, &dims); +- maxSequenceSizePerPlate = dims.dims[0]; +- ie_blob_get_buffer(input_blob, &blob_buffer); +- blob_data = (float *)(blob_buffer.buffer); +- blob_data[0] = 0.0f; +- for (int n = 1; n < maxSequenceSizePerPlate; n++) +- blob_data[n] = 1.0f; +- ie_blob_destroy(&input_blob); +- } +- ie_network_name_free(&input_name); +- } +-#endif + request->callback.completeCallBackFunc = completion_callback; + request->callback.args = request; + request->inference_ctx = ctx; +@@ -652,7 +630,7 @@ static void OpenVINOImageInferenceClose(ImageInferenceContext *ctx) { + OpenVINOImageInference *vino = (OpenVINOImageInference *)ctx->priv; + if (vino->infer_requests) { + for (size_t i = 0; i < vino->num_reqs; ++i) +- if(vino->infer_requests[i]) ++ if (vino->infer_requests[i]) + ie_infer_request_free(&vino->infer_requests[i]); + free(vino->infer_requests); + } +@@ -671,14 +649,17 @@ static void OpenVINOImageInferenceClose(ImageInferenceContext *ctx) { + if (vino->input_name) + ie_network_name_free(&vino->input_name); + ++ if (vino->input_name_imginfo) ++ ie_network_name_free(&vino->input_name_imginfo); ++ + pthread_mutex_destroy(&vino->flush_mutex); + pthread_mutex_destroy(&vino->callback_mutex); + pthread_mutex_destroy(&vino->count_mutex); + pthread_cond_destroy(&vino->request_processed); + +- if (vino->vpp_ctx) { +- vino->vpp_ctx->pre_proc->Destroy(vino->vpp_ctx); +- pre_proc_free(vino->vpp_ctx); ++ if (vino->pre_processor) { ++ vino->pre_processor->pre_proc->Destroy(vino->pre_processor); ++ ReleasePreProcessor(vino->pre_processor); + } + + ie_exec_network_free(&vino->exe_network); +@@ -715,10 +696,11 @@ static void completion_callback(void *args) { + for (int n = 0; n < blob_array.num_blobs; n++) { + OutputBlobContext *blob_ctx = blob_array.output_blobs[n]; + OpenVINOOutputBlob *vino_blob = (OpenVINOOutputBlob *)blob_ctx->priv; ++ char *output_name = NULL; + ie_infer_request_set_blob(request->infer_request, vino_blob->name, vino_blob->blob); +- char *output_name = (char *)vino_blob->name; ++ output_name = (char *)vino_blob->name; + ie_network_name_free(&output_name); +- 
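The submit path that ends here batches by accumulation: every SubmitImage call preprocesses into the pending request and only starts the asynchronous inference once batch_size frames are queued, after which the completion callback returns the request to the free queue. The trigger logic in isolation, with illustrative types only; start_async() stands in for the IE async start:

/* Generic form of the batch trigger in the submit path above. */
#include <stddef.h>

#define MAX_BATCH 8

typedef struct BatchReq {
    void *user_data[MAX_BATCH];   /* per-frame user data, as in BufferMap */
    size_t num;
} BatchReq;

static void start_async(BatchReq *req)
{
    (void)req;                    /* the real code calls the IE async API here */
}

static void submit_frame(BatchReq *req, void *user_data, size_t batch_size)
{
    req->user_data[req->num++] = user_data;
    if (req->num >= batch_size) { /* enough frames for one batched inference */
        start_async(req);
        req->num = 0;             /* the real code pops a fresh free request */
    }
}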
ie_blob_destroy(&vino_blob->blob); ++ ie_blob_free(&vino_blob->blob); + output_blob_free(blob_ctx); + } + blob_array.num_blobs = 0; +@@ -756,7 +738,7 @@ static Dimensions OpenVINOOutputBlobGetDims(OutputBlobContext *ctx) { + + ie_blob_get_dims(vino_blob->blob, &dims); + dims_res.num_dims = dims.ranks; +- for (size_t i = 0; i< dims_res.num_dims; ++i) ++ for (size_t i = 0; i < dims_res.num_dims; ++i) + dims_res.dims[i] = dims.dims[i]; + return dims_res; + } +diff --git a/libavfilter/inference_backend/openvino_image_inference.h b/libavfilter/inference_backend/openvino_image_inference.h +index 3e06396bb0..8535a8e6f4 100644 +--- a/libavfilter/inference_backend/openvino_image_inference.h ++++ b/libavfilter/inference_backend/openvino_image_inference.h +@@ -20,10 +20,10 @@ + + #pragma once + +-#include + #include "image_inference.h" + #include "pre_proc.h" + #include "safe_queue.h" ++#include + #include + + typedef struct BatchRequest { +@@ -36,9 +36,6 @@ typedef struct BatchRequest { + } BatchRequest; + + typedef struct OpenVINOImageInference { +- int resize_by_inference; +- colorformat_e ie_color_format; +- + CallbackFunc callback; + + // Inference Engine +@@ -46,6 +43,7 @@ typedef struct OpenVINOImageInference { + ie_network_t *network; + char *model_name; + char *input_name; ++ char *input_name_imginfo; // special for Faster-RCNN and LPR converted from Caffe + ie_executable_network_t *exe_network; + ie_infer_request_t **infer_requests; + size_t num_reqs; +@@ -56,10 +54,10 @@ typedef struct OpenVINOImageInference { + // Threading + int batch_size; + pthread_t working_thread; +- SafeQueueT *freeRequests; // BatchRequest queue ++ SafeQueueT *freeRequests; // BatchRequest queue + + // VPP +- PreProcContext *vpp_ctx; ++ PreProcContext *pre_processor; + + int already_flushed; + unsigned int requests_processing; +diff --git a/libavfilter/inference_backend/pre_proc.c b/libavfilter/inference_backend/pre_proc.c +index 623c111dd4..7fa5b758fe 100644 +--- a/libavfilter/inference_backend/pre_proc.c ++++ b/libavfilter/inference_backend/pre_proc.c +@@ -28,6 +28,11 @@ extern PreProc pre_proc_gapi; + extern PreProc pre_proc_vaapi; + extern PreProc pre_proc_mocker; + ++static const PreProc *pre_proc_get_by_name(const char *name); ++static const PreProc *pre_proc_get_by_type(MemoryType type); ++static PreProcContext *pre_proc_alloc(const PreProc *pre_proc); ++static void pre_proc_free(PreProcContext *context); ++ + static const PreProc *const pre_proc_list[] = { + #if HAVE_FFMPEG || CONFIG_SWSCALE + &pre_proc_swscale, +@@ -72,7 +77,7 @@ static const PreProc *pre_proc_iterate(void **opaque) { + return pp; + } + +-const PreProc *pre_proc_get_by_name(const char *name) { ++static const PreProc *pre_proc_get_by_name(const char *name) { + const PreProc *pp = NULL; + void *opaque = 0; + +@@ -86,7 +91,7 @@ const PreProc *pre_proc_get_by_name(const char *name) { + return NULL; + } + +-const PreProc *pre_proc_get_by_type(MemoryType type) { ++static const PreProc *pre_proc_get_by_type(MemoryType type) { + const PreProc *ret = NULL; + + if (type == MEM_TYPE_SYSTEM) { +@@ -102,7 +107,7 @@ const PreProc *pre_proc_get_by_type(MemoryType type) { + return ret; + } + +-PreProcContext *pre_proc_alloc(const PreProc *pre_proc) { ++static PreProcContext *pre_proc_alloc(const PreProc *pre_proc) { + PreProcContext *ret; + + if (pre_proc == NULL) +@@ -127,7 +132,7 @@ err: + return NULL; + } + +-void pre_proc_free(PreProcContext *context) { ++static void pre_proc_free(PreProcContext *context) { + if (context == NULL) + return; + +@@ 
-136,6 +141,23 @@ void pre_proc_free(PreProcContext *context) { + free(context); + } + ++PreProcContext *CreatePreProcessor(const char *pre_processor_name) { ++ const PreProc *_pre_proc; ++ ++ if (!pre_processor_name) ++ pre_processor_name = "swscale"; ++ ++ _pre_proc = pre_proc_get_by_name(pre_processor_name); ++ if (!_pre_proc) ++ return NULL; ++ ++ return pre_proc_alloc(_pre_proc); ++} ++ ++void ReleasePreProcessor(PreProcContext *context) { ++ pre_proc_free(context); ++} ++ + #ifdef DEBUG + #include "logger.h" + #include +@@ -236,4 +258,4 @@ void DumpImageInfo(const Image *p) { + VAII_LOGI("Image w:%d h:%d f:%x, plane: %p %p %p stride: %d %d %d \n", p->width, p->height, p->format, + p->planes[0], p->planes[1], p->planes[2], p->stride[0], p->stride[1], p->stride[2]); + } +-#endif +\ No newline at end of file ++#endif +diff --git a/libavfilter/inference_backend/pre_proc.h b/libavfilter/inference_backend/pre_proc.h +index 805767b67d..2cc5734498 100644 +--- a/libavfilter/inference_backend/pre_proc.h ++++ b/libavfilter/inference_backend/pre_proc.h +@@ -64,13 +64,9 @@ struct PreProcContext { + + int GetPlanesCount(int fourcc); + +-const PreProc *pre_proc_get_by_name(const char *name); ++PreProcContext *CreatePreProcessor(const char *pre_processor_name); + +-const PreProc *pre_proc_get_by_type(MemoryType type); +- +-PreProcContext *pre_proc_alloc(const PreProc *pre_proc); +- +-void pre_proc_free(PreProcContext *context); ++void ReleasePreProcessor(PreProcContext *context); + + #ifdef DEBUG + void DumpBGRpToRgb24File(const Image *out_image); +@@ -78,4 +74,4 @@ void DumpRGBpToRgb24File(const Image *out_image); + void DumpRGBpToFile(const Image *out_image); + void DumpBGRAToFile(const Image *out_image); + inline void DumpImageInfo(const Image *p); +-#endif +\ No newline at end of file ++#endif +diff --git a/libavfilter/inference_backend/wrap_image.c b/libavfilter/inference_backend/wrap_image.c +new file mode 100755 +index 0000000000..968b2159e6 +--- /dev/null ++++ b/libavfilter/inference_backend/wrap_image.c +@@ -0,0 +1,138 @@ ++/* ++ * Copyright (c) 2018-2020 Intel Corporation ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
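With CreatePreProcessor()/ReleasePreProcessor() as the only public entry points, the by-name and by-type lookups above become internal details of pre_proc.c. A usage sketch of the narrowed API; src/dst Image setup is elided, and NULL selects the "swscale" default:

/* Using the wrapped pre-processor API introduced above. */
#include "pre_proc.h"

static int convert_once(Image *src, Image *dst)
{
    PreProcContext *pp = CreatePreProcessor(NULL); /* NULL -> default "swscale" */
    if (!pp)
        return -1;

    pp->pre_proc->Convert(pp, src, dst, 0);        /* same call shape as the patch */

    pp->pre_proc->Destroy(pp);                     /* mirror the teardown order in */
    ReleasePreProcessor(pp);                       /* OpenVINOImageInferenceClose   */
    return 0;
}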
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++#include <assert.h>
++#include "logger.h"
++#include "wrap_image.h"
++
++static inline int getNumberChannels(int format) {
++ switch (format) {
++ case FOURCC_BGRA:
++ case FOURCC_BGRX:
++ case FOURCC_RGBA:
++ case FOURCC_RGBX:
++ return 4;
++ case FOURCC_BGR:
++ return 3;
++ }
++ return 0;
++}
++
++ie_blob_t *WrapImageToBlob(const Image *img) {
++ ie_blob_t *blob = NULL;
++
++ switch (img->format) {
++ case FOURCC_BGRA:
++ case FOURCC_BGRX:
++ case FOURCC_RGBA:
++ case FOURCC_RGBX:
++ case FOURCC_BGR: {
++ ie_blob_t *input_blob;
++ tensor_desc_t tensor_desc = {NHWC, {4, {1, getNumberChannels(img->format), img->height, img->width}}, U8};
++
++ ie_blob_make_memory_from_preallocated(&tensor_desc, img->planes[0], 0, &input_blob);
++ if (img->rect.width && img->rect.height) {
++ ie_blob_t *input_blob_roi;
++ roi_t roi = {0, (size_t)img->rect.x, (size_t)img->rect.y, (size_t)img->rect.width,
++ (size_t)img->rect.height};
++ ie_blob_make_memory_with_roi(input_blob, &roi, &input_blob_roi);
++ ie_blob_free(&input_blob);
++ input_blob = input_blob_roi;
++ }
++ blob = input_blob;
++ break;
++ }
++ case FOURCC_I420: {
++ ie_blob_t *y_blob, *u_blob, *v_blob, *i420_blob = NULL;
++ tensor_desc_t y_tensor_desc, u_v_tensor_desc;
++
++ ie_blob_t *y_blob_roi, *u_blob_roi, *v_blob_roi;
++ roi_t crop_roi_y, crop_roi_u_v;
++
++ y_tensor_desc = (tensor_desc_t){NHWC, {4, {1, 1, img->height, img->width}}, U8};
++ u_v_tensor_desc = (tensor_desc_t){NHWC, {4, {1, 1, img->height / 2, img->width / 2}}, U8};
++
++ assert(img->planes[0] && img->planes[1] && img->planes[2]);
++
++ ie_blob_make_memory_from_preallocated(&y_tensor_desc, img->planes[0], 0, &y_blob);
++ ie_blob_make_memory_from_preallocated(&u_v_tensor_desc, img->planes[1], 0, &u_blob);
++ ie_blob_make_memory_from_preallocated(&u_v_tensor_desc, img->planes[2], 0, &v_blob);
++
++ crop_roi_y = (roi_t){0, (size_t)((img->rect.x & 0x1) ? img->rect.x - 1 : img->rect.x),
++ (size_t)((img->rect.y & 0x1) ? img->rect.y - 1 : img->rect.y),
++ (size_t)((img->rect.width & 0x1) ? img->rect.width - 1 : img->rect.width),
++ (size_t)((img->rect.height & 0x1) ?
img->rect.height - 1 : img->rect.height)};
++
++ crop_roi_u_v = (roi_t){0, (size_t)img->rect.x / 2, (size_t)img->rect.y / 2, (size_t)img->rect.width / 2,
++ (size_t)img->rect.height / 2};
++
++ ie_blob_make_memory_with_roi(y_blob, &crop_roi_y, &y_blob_roi);
++ ie_blob_make_memory_with_roi(u_blob, &crop_roi_u_v, &u_blob_roi);
++ ie_blob_make_memory_with_roi(v_blob, &crop_roi_u_v, &v_blob_roi);
++
++ ie_blob_make_memory_i420(y_blob_roi, u_blob_roi, v_blob_roi, &i420_blob);
++ ie_blob_free(&y_blob);
++ ie_blob_free(&u_blob);
++ ie_blob_free(&v_blob);
++ ie_blob_free(&y_blob_roi);
++ ie_blob_free(&u_blob_roi);
++ ie_blob_free(&v_blob_roi);
++
++ blob = i420_blob;
++ break;
++ }
++ case FOURCC_NV12: {
++ ie_blob_t *y_blob, *uv_blob, *nv12_blob = NULL;
++ tensor_desc_t y_tensor_desc, uv_tensor_desc;
++
++ ie_blob_t *y_blob_roi, *uv_blob_roi;
++ roi_t crop_roi_y, crop_roi_uv;
++
++ y_tensor_desc = (tensor_desc_t){NHWC, {4, {1, 1, img->height, img->width}}, U8};
++ uv_tensor_desc = (tensor_desc_t){NHWC, {4, {1, 2, img->height / 2, img->width / 2}}, U8};
++
++ ie_blob_make_memory_from_preallocated(&y_tensor_desc, img->planes[0], 0, &y_blob);
++ ie_blob_make_memory_from_preallocated(&uv_tensor_desc, img->planes[1], 0, &uv_blob);
++
++ crop_roi_y = (roi_t){0, (size_t)((img->rect.x & 0x1) ? img->rect.x - 1 : img->rect.x),
++ (size_t)((img->rect.y & 0x1) ? img->rect.y - 1 : img->rect.y),
++ (size_t)((img->rect.width & 0x1) ? img->rect.width - 1 : img->rect.width),
++ (size_t)((img->rect.height & 0x1) ? img->rect.height - 1 : img->rect.height)};
++
++ crop_roi_uv = (roi_t){0, (size_t)img->rect.x / 2, (size_t)img->rect.y / 2, (size_t)img->rect.width / 2,
++ (size_t)img->rect.height / 2};
++
++ ie_blob_make_memory_with_roi(y_blob, &crop_roi_y, &y_blob_roi);
++ ie_blob_make_memory_with_roi(uv_blob, &crop_roi_uv, &uv_blob_roi);
++
++ ie_blob_make_memory_nv12(y_blob_roi, uv_blob_roi, &nv12_blob);
++ ie_blob_free(&y_blob);
++ ie_blob_free(&uv_blob);
++ ie_blob_free(&y_blob_roi);
++ ie_blob_free(&uv_blob_roi);
++ blob = nv12_blob;
++ break;
++ }
++ default:
++ VAII_ERROR("Format not supported!");
++ return NULL;
++ }
++
++ return blob;
++}
+diff --git a/libavfilter/inference_backend/wrap_image.h b/libavfilter/inference_backend/wrap_image.h
+new file mode 100755
+index 0000000000..6b783ff27f
+--- /dev/null
++++ b/libavfilter/inference_backend/wrap_image.h
+@@ -0,0 +1,27 @@
++/*
++ * Copyright (c) 2018-2020 Intel Corporation
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#pragma once
++
++#include <c_api/ie_c_api.h>
++
++#include "image.h"
++
++ie_blob_t *WrapImageToBlob(const Image *img);
+diff --git a/libavfilter/vf_inference_classify.c b/libavfilter/vf_inference_classify.c
+index 3fd2296934..cd216d0e51 100644
+--- a/libavfilter/vf_inference_classify.c
++++ b/libavfilter/vf_inference_classify.c
+@@ -325,7 +325,6 @@ static const AVOption inference_classify_options[] = {
+ { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = 1}, 0, 2, FLAGS },
+ { "model", "path to model file for network", OFFSET(model), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+- { "object_class", "objective class", OFFSET(object_class), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "device", "running on device name", OFFSET(device), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "configs", "configurations to backend", OFFSET(infer_config), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS},
+diff --git a/libavfilter/vf_inference_detect.c b/libavfilter/vf_inference_detect.c
+index 2742f3500d..e673fedd18 100644
+--- a/libavfilter/vf_inference_detect.c
++++ b/libavfilter/vf_inference_detect.c
+@@ -309,7 +309,6 @@ static const AVOption inference_detect_options[] = {
+ { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = 1}, 0, 2, FLAGS },
+ { "model", "path to model file for network", OFFSET(model), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+- { "object_class", "objective class", OFFSET(object_class), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "device", "running on device name", OFFSET(device), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "configs", "configurations to backend", OFFSET(infer_config), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
+ { "interval", "detect every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS},
+diff --git a/libavfilter/vf_inference_infer.c b/libavfilter/vf_inference_infer.c
+new file mode 100755
+index 0000000000..e1ac33fd98
+--- /dev/null
++++ b/libavfilter/vf_inference_infer.c
+@@ -0,0 +1,337 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * video inference filter used for generic inference
++ */
++
++#include "libavutil/opt.h"
++#include "libavutil/mem.h"
++#include "libavutil/eval.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/mathematics.h"
++
++#include "formats.h"
++#include "internal.h"
++#include "avfilter.h"
++#include "filters.h"
++#include "libavcodec/avcodec.h"
++#include "libavformat/avformat.h"
++#include "libavutil/time.h"
++
++#include "inference_backend/ff_base_inference.h"
++
++#define OFFSET(x) offsetof(IEInferContext, x)
++#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
++
++static int flush_frame(AVFilterContext *ctx, AVFilterLink *outlink, int64_t pts, int64_t *out_pts);
++
++typedef struct IEInferContext {
++ const AVClass *class;
++
++ FFBaseInference *base;
++
++ FF_INFERENCE_OPTIONS
++
++ int async_preproc;
++ int backend_type;
++ int already_flushed;
++} IEInferContext;
++
++static int query_formats(AVFilterContext *context)
++{
++ AVFilterFormats *formats_list;
++ const enum AVPixelFormat pixel_formats[] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_NV12, AV_PIX_FMT_BGR24,
++ AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR0, AV_PIX_FMT_RGBP,
++ AV_PIX_FMT_VAAPI, AV_PIX_FMT_NONE};
++
++ formats_list = ff_make_format_list(pixel_formats);
++ if (!formats_list) {
++ av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
++ return AVERROR(ENOMEM);
++ }
++
++ return ff_set_common_formats(context, formats_list);
++}
++
++static int config_input(AVFilterLink *inlink)
++{
++ int ret = 0;
++ AVFilterContext *ctx = inlink->dst;
++ IEInferContext *s = ctx->priv;
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
++ if (desc == NULL)
++ return AVERROR(EINVAL);
++
++ FFInferenceParam param = { };
++ param = s->base->param;
++
++ if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) {
++ AVHWFramesContext *hw_frm_ctx = (AVHWFramesContext *)inlink->hw_frames_ctx->data;
++ AVHWDeviceContext *dev_ctx = (AVHWDeviceContext *)hw_frm_ctx->device_ref->data;
++#if CONFIG_VAAPI
++ param.vpp_device = VPP_DEVICE_HW;
++ param.opaque = (void *)((AVVAAPIDeviceContext *)dev_ctx->hwctx)->display;
++#endif
++ for (int i = 0; i < ctx->nb_outputs; i++) {
++ if (!ctx->outputs[i]->hw_frames_ctx)
++ ctx->outputs[i]->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
++ }
++ }
++
++ ret = av_base_inference_set_params(s->base, &param);
++
++ return ret;
++}
++
++static av_cold int infer_init(AVFilterContext *ctx)
++{
++ int ret;
++ IEInferContext *s = ctx->priv;
++ av_assert0(s->model);
++ FFInferenceParam param = { };
++
++ param.model = s->model;
++ param.device = s->device;
++ param.nireq = s->nireq;
++ param.batch_size = s->batch_size;
++ param.every_nth_frame = s->every_nth_frame;
++ param.threshold = s->threshold;
++ param.is_full_frame = 1;
++ param.infer_config = s->infer_config;
++ param.model_proc = s->model_proc;
++ param.opaque = s->async_preproc ?
(void *)MOCKER_PRE_PROC_MAGIC : 0;
++
++ s->base = av_base_inference_create(ctx->filter->name);
++ if (!s->base) {
++ av_log(ctx, AV_LOG_ERROR, "Could not create inference.\n");
++ return AVERROR(EINVAL);
++ }
++ ret = av_base_inference_init(s->base, &param);
++
++ return ret;
++}
++
++static av_cold void infer_uninit(AVFilterContext *ctx)
++{
++ IEInferContext *s = ctx->priv;
++
++ flush_frame(ctx, NULL, 0LL, NULL);
++
++ av_base_inference_release(s->base);
++}
++
++static int flush_frame(AVFilterContext *ctx, AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
++{
++ int ret = 0;
++ IEInferContext *s = ctx->priv;
++
++ if (s->already_flushed)
++ return ret;
++
++ while (!av_base_inference_frame_queue_empty(ctx, s->base)) {
++ AVFrame *output = NULL;
++ av_base_inference_get_frame(ctx, s->base, &output);
++ if (output) {
++ if (outlink) {
++ ret = ff_filter_frame(outlink, output);
++ if (out_pts)
++ *out_pts = output->pts + pts;
++ } else {
++ av_frame_free(&output);
++ }
++ }
++
++ av_base_inference_send_event(ctx, s->base, INFERENCE_EVENT_EOS);
++ av_usleep(5000);
++ }
++
++ s->already_flushed = 1;
++ return ret;
++}
++
++static int load_balance(AVFilterContext *ctx)
++{
++ AVFilterLink *inlink = ctx->inputs[0];
++ AVFilterLink *outlink = ctx->outputs[0];
++ IEInferContext *s = ctx->priv;
++ AVFrame *in = NULL, *output = NULL;
++ int64_t pts;
++ int ret, status;
++ int resource, got_frames = 0;
++ int get_frame_status;
++
++ FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
++
++ // drain all processed frames
++ do {
++ get_frame_status = av_base_inference_get_frame(ctx, s->base, &output);
++ if (output) {
++ int ret_val = ff_filter_frame(outlink, output);
++ if (ret_val < 0)
++ return ret_val;
++
++ got_frames = 1;
++ output = NULL;
++ }
++ } while (get_frame_status == 0);
++
++ status = ff_outlink_get_status(inlink);
++ if (status)
++ resource = ff_inlink_queued_frames(inlink);
++ else
++ resource = av_base_inference_resource_status(ctx, s->base);
++
++ while (resource > 0) {
++ ret = ff_inlink_consume_frame(inlink, &in);
++ if (ret < 0)
++ return ret;
++ if (ret == 0)
++ break;
++ if (ret > 0) {
++ av_base_inference_send_frame(ctx, s->base, in);
++ }
++ resource--;
++ }
++
++ if (!status && got_frames)
++ return 0;
++
++ if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
++ if (status == AVERROR_EOF) {
++ int64_t out_pts = pts;
++
++ av_log(ctx, AV_LOG_INFO, "Got EOS.\n");
++ ret = flush_frame(ctx, outlink, pts, &out_pts);
++ ff_outlink_set_status(outlink, status, out_pts);
++ return ret;
++ }
++ }
++
++ FF_FILTER_FORWARD_WANTED(outlink, inlink);
++
++ return FFERROR_NOT_READY;
++}
++
++static int activate(AVFilterContext *ctx)
++{
++ AVFilterLink *inlink = ctx->inputs[0];
++ AVFilterLink *outlink = ctx->outputs[0];
++ IEInferContext *s = ctx->priv;
++ AVFrame *in = NULL, *output = NULL;
++ int64_t pts;
++ int ret, status;
++ int got_frame = 0;
++
++ if (av_load_balance_get())
++ return load_balance(ctx);
++
++ FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
++
++ do {
++ int get_frame_status;
++ // drain all input frames
++ ret = ff_inlink_consume_frame(inlink, &in);
++ if (ret < 0)
++ return ret;
++ if (ret > 0)
++ av_base_inference_send_frame(ctx, s->base, in);
++
++ // drain all processed frames
++ do {
++ get_frame_status = av_base_inference_get_frame(ctx, s->base, &output);
++ if (output) {
++ int ret_val = ff_filter_frame(outlink, output);
++ if (ret_val < 0)
++ return ret_val;
++
++ got_frame = 1;
++ output = NULL;
++ }
++ } while (get_frame_status == 0);
++ } while (ret
> 0);
++
++ // if frame got, schedule to next filter
++ if (got_frame)
++ return 0;
++
++ if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
++ if (status == AVERROR_EOF) {
++ int64_t out_pts = pts;
++
++ av_log(ctx, AV_LOG_INFO, "Got EOS.\n");
++ ret = flush_frame(ctx, outlink, pts, &out_pts);
++ ff_outlink_set_status(outlink, status, out_pts);
++ return ret;
++ }
++ }
++
++ FF_FILTER_FORWARD_WANTED(outlink, inlink);
++
++ return FFERROR_NOT_READY;
++}
++
++static const AVOption inference_infer_options[] = {
++ { "dnn_backend", "DNN backend for model execution", OFFSET(backend_type), AV_OPT_TYPE_FLAGS, { .i64 = 1}, 0, 2, FLAGS },
++ { "model", "path to model file for network", OFFSET(model), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
++ { "model_proc", "model preproc and postproc", OFFSET(model_proc), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
++ { "device", "running on device name", OFFSET(device), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
++ { "configs", "configurations to backend", OFFSET(infer_config), AV_OPT_TYPE_STRING, { .str = NULL}, 0, 0, FLAGS },
++ { "interval", "infer every Nth frame", OFFSET(every_nth_frame), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1024, FLAGS},
++ { "nireq", "inference request number", OFFSET(nireq), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 128, FLAGS},
++ { "batch_size", "batch size per infer", OFFSET(batch_size), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, 1000, FLAGS},
++ { "threshold", "threshold to filter output data", OFFSET(threshold), AV_OPT_TYPE_FLOAT, { .dbl = 0.5}, 0, 1, FLAGS},
++ { "async_preproc", "do asynchronous preproc in inference backend", OFFSET(async_preproc), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
++
++ { NULL }
++};
++
++AVFILTER_DEFINE_CLASS(inference_infer);
++
++static const AVFilterPad infer_inputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .config_props = config_input,
++ },
++ { NULL }
++};
++
++static const AVFilterPad infer_outputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ },
++ { NULL }
++};
++
++AVFilter ff_vf_inference_infer = {
++ .name = "infer",
++ .description = NULL_IF_CONFIG_SMALL("Generic Video Inference Filter."),
++ .priv_size = sizeof(IEInferContext),
++ .query_formats = query_formats,
++ .activate = activate,
++ .init = infer_init,
++ .uninit = infer_uninit,
++ .inputs = infer_inputs,
++ .outputs = infer_outputs,
++ .priv_class = &inference_infer_class,
++ .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
++};
+diff --git a/libavfilter/vf_inference_python.c b/libavfilter/vf_inference_python.c
+new file mode 100755
+index 0000000000..c73c4cac32
+--- /dev/null
++++ b/libavfilter/vf_inference_python.c
+@@ -0,0 +1,334 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * filter that calls a Python script to post-process data produced by the inference filters
++ */
++
++#include <Python.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/eval.h"
++#include "libavutil/mathematics.h"
++#include "libavutil/mem.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++
++#include "avfilter.h"
++#include "filters.h"
++#include "formats.h"
++#include "internal.h"
++#include "libavcodec/avcodec.h"
++#include "libavformat/avformat.h"
++#include "libavutil/time.h"
++
++#include "inference_backend/ff_base_inference.h"
++
++#define OFFSET(x) offsetof(VAPythonContext, x)
++#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_FILTERING_PARAM)
++
++typedef struct PythonContext {
++ int initialized;
++ PyGILState_STATE state;
++} PythonContext;
++
++typedef struct VAPythonContext {
++ const AVClass *class;
++
++ char *ffmodule_path;
++ char *custom_script;
++ char *function_name;
++
++ PythonContext py_context;
++ PyObject *py_videoframe_class;
++ PyObject *py_class;
++ PyObject *py_function;
++} VAPythonContext;
++
++static int query_formats(AVFilterContext *context) {
++ AVFilterFormats *formats_list;
++ const enum AVPixelFormat pixel_formats[] = {AV_PIX_FMT_BGR24, AV_PIX_FMT_RGB24, AV_PIX_FMT_BGRA,
++ AV_PIX_FMT_BGR0, AV_PIX_FMT_NV12, AV_PIX_FMT_NONE};
++
++ formats_list = ff_make_format_list(pixel_formats);
++ if (!formats_list) {
++ av_log(context, AV_LOG_ERROR, "Could not create formats list\n");
++ return AVERROR(ENOMEM);
++ }
++
++ return ff_set_common_formats(context, formats_list);
++}
++
++static int config_input(AVFilterLink *inlink) {
++ int ret = 0;
++ // AVFilterContext *ctx = inlink->dst;
++ // VAPythonContext *s = ctx->priv;
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
++ if (desc == NULL)
++ return AVERROR(EINVAL);
++
++ return ret;
++}
++
++static inline int py_append_module_path(PyObject *sys_path, const char *module_path) {
++ PyObject *pyPath;
++
++ pyPath = PyUnicode_FromString(module_path);
++ PyList_Append(sys_path, pyPath);
++ Py_DECREF(pyPath);
++
++ fprintf(stderr, "append path %s!\n", module_path);
++
++ if (PyErr_Occurred()) {
++ PyErr_Print();
++ return -1;
++ }
++
++ return 0;
++}
++
++static inline PyObject *py_import_module(const char *module_name) {
++ PyObject *py_name = PyUnicode_FromString(module_name);
++ PyObject *py_module = PyImport_Import(py_name);
++
++ Py_DECREF(py_name);
++
++ fprintf(stderr, "import module %s!\n", module_name);
++
++ if (!py_module && PyErr_Occurred()) {
++ PyErr_Print();
++ fprintf(stderr, "Cannot find module %s!\n", module_name);
++ return NULL;
++ }
++
++ return py_module;
++}
++
++static inline PyObject *py_get_attr_string(PyObject *module, const char *string) {
++ PyObject *py_attr = PyObject_GetAttrString(module, string);
++
++ if (!py_attr && PyErr_Occurred()) {
++ PyErr_Print();
++ fprintf(stderr, "Get %s failed!\n", string);
++ return NULL;
++ }
++
++ return py_attr;
++}
++
++static av_cold int va_python_init(AVFilterContext *ctx) {
++ int ret = 0;
++ int initialized = 0;
++ PyGILState_STATE state = PyGILState_UNLOCKED;
++ VAPythonContext *s = ctx->priv;
++ PyObject *sys_path;
++ PyObject *ffmodule = NULL, *cusmodule = NULL;
++ PyObject *videoframe_class, *custom_class = NULL, *custom_func = NULL;
++ const char *custom_dir, *last_slash,
*suffix, *filename;
++ size_t len;
++
++ static wchar_t tmp[] = L"";
++ static wchar_t *empty_argv[] = {tmp};
++
++ if (!s->ffmodule_path || !s->custom_script) {
++ av_log(ctx, AV_LOG_ERROR, "paths to the ffmpeg python module and the custom script are needed.\n");
++ return AVERROR(EINVAL);
++ }
++
++ if (!s->function_name) {
++ av_log(ctx, AV_LOG_ERROR, "function name cannot be null.\n");
++ return AVERROR(EINVAL);
++ }
++
++ // split user-specified python script into path and name
++ last_slash = strrchr(s->custom_script, '/');
++ if (!last_slash) {
++ av_log(ctx, AV_LOG_ERROR, "full path to the custom script is required.\n");
++ return AVERROR(EINVAL);
++ }
++ // get path of dir
++ len = (size_t)(last_slash - s->custom_script);
++ custom_dir = av_mallocz(len + 1);
++ if (!custom_dir)
++ return AVERROR(ENOMEM);
++ strncpy((char *)custom_dir, s->custom_script, len);
++ // get filename
++ suffix = strrchr(s->custom_script, '.');
++ if (suffix) {
++ len = suffix - last_slash - 1;
++ filename = av_mallocz(len + 1);
++ if (!filename) {
++ av_freep(&custom_dir);
++ return AVERROR(ENOMEM);
++ }
++ strncpy((char *)filename, last_slash + 1, len);
++ } else {
++ len = 0;
++ filename = last_slash + 1;
++ }
++
++ // init context
++ initialized = Py_IsInitialized();
++ if (initialized) {
++ state = PyGILState_Ensure();
++ } else {
++ Py_Initialize();
++ }
++
++ PySys_SetArgv(1, empty_argv);
++
++ // append module path to sys path
++ sys_path = PySys_GetObject("path");
++ if ((ret = py_append_module_path(sys_path, s->ffmodule_path)) < 0)
++ goto exit;
++ if ((ret = py_append_module_path(sys_path, custom_dir)) < 0)
++ goto exit;
++
++ // import modules by name
++ ffmodule = py_import_module("ffmpeg");
++ cusmodule = py_import_module(filename);
++ if (!ffmodule || !cusmodule) {
++ ret = AVERROR(EINVAL);
++ goto exit;
++ }
++
++ videoframe_class = py_get_attr_string(ffmodule, "VideoFrame");
++ custom_func = py_get_attr_string(cusmodule, s->function_name);
++ if (!videoframe_class || !custom_func) {
++ ret = AVERROR(EINVAL);
++ goto exit;
++ }
++
++ s->py_context.initialized = initialized;
++ s->py_context.state = state;
++ s->py_videoframe_class = videoframe_class;
++ s->py_function = custom_func;
++
++exit:
++ if (ffmodule)
++ Py_DECREF(ffmodule);
++
++ if (cusmodule)
++ Py_DECREF(cusmodule);
++
++ av_freep(&custom_dir);
++ if (len != 0)
++ av_freep(&filename);
++ return ret;
++}
++
++static av_cold void call_python(AVFrame *frame, PyObject *py_videoframe_class, PyObject *py_function) {
++ void *ptr = (void *)frame;
++ PyObject *py_addr = PyLong_FromVoidPtr(ptr);
++ PyObject *pyFrame = PyObject_CallFunctionObjArgs(py_videoframe_class, py_addr, NULL);
++ Py_XDECREF(py_addr);
++ if (!pyFrame) {
++ if (PyErr_Occurred())
++ PyErr_Print();
++ return;
++ }
++
++ {
++ PyObject *pyFunc = py_function;
++ if (pyFunc && PyCallable_Check(pyFunc)) {
++ PyObject *args = Py_BuildValue("(O)", pyFrame);
++ PyObject *pValue = PyObject_CallObject(pyFunc, args);
++ Py_XDECREF(args);
++ if (pValue != NULL) {
++ // av_log(NULL, AV_LOG_ERROR, "Result of call: %ld\n", PyLong_AsLong(pValue));
++ Py_DECREF(pValue);
++ } else {
++ if (PyErr_Occurred())
++ PyErr_Print();
++ }
++ } else {
++ if (PyErr_Occurred())
++ PyErr_Print();
++ fprintf(stderr, "Custom function is not callable!\n");
++ }
++ }
++ Py_DECREF(pyFrame);
++}
++
++static av_cold void va_python_uninit(AVFilterContext *ctx) {
++ VAPythonContext *s = ctx->priv;
++
++ if (s->py_videoframe_class)
++ Py_DECREF(s->py_videoframe_class);
++
++ if (s->py_function)
++ Py_DECREF(s->py_function);
++
++ if (s->py_class)
++ Py_DECREF(s->py_class);
++
++ if (s->py_context.initialized) {
++ PyGILState_Release(s->py_context.state);
++ } else { ++ PyEval_SaveThread(); ++ // Py_Finalize(); ++ } ++} ++ ++static int filter_frame(AVFilterLink *inlink, AVFrame *frame) { ++ AVFilterContext *ctx = inlink->dst; ++ VAPythonContext *s = ctx->priv; ++ AVFilterLink *outlink = inlink->dst->outputs[0]; ++ ++ PyGILState_STATE state = PyGILState_Ensure(); ++ ++ call_python(frame, s->py_videoframe_class, s->py_function); ++ ++ PyGILState_Release(state); ++ ++ return ff_filter_frame(outlink, frame); ++} ++ ++static const AVOption inference_python_options[] = { ++ { "ffmodule_path", "path to ffmpeg python module", OFFSET(ffmodule_path), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, ++ { "custom_script", "path to custom python script", OFFSET(custom_script), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS }, ++ { "function_name", "callback function name, default: process_frame", ++ OFFSET(function_name), AV_OPT_TYPE_STRING, {.str = "process_frame"}, 0, 0, FLAGS }, ++ { NULL } }; ++ ++AVFILTER_DEFINE_CLASS(inference_python); ++ ++static const AVFilterPad va_python_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .config_props = config_input, ++ .filter_frame = filter_frame, ++ }, ++ { NULL } }; ++ ++static const AVFilterPad va_python_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ }, ++ { NULL }}; ++ ++AVFilter ff_vf_inference_python = { ++ .name = "python", ++ .description = NULL_IF_CONFIG_SMALL("Video analytics post processing filter using Python."), ++ .priv_size = sizeof(VAPythonContext), ++ .query_formats = query_formats, ++ .init = va_python_init, ++ .uninit = va_python_uninit, ++ .inputs = va_python_inputs, ++ .outputs = va_python_outputs, ++ .priv_class = &inference_python_class, ++ // .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, ++}; +diff --git a/libavutil/frame.c b/libavutil/frame.c +index 90a586a2c9..17cb536cb3 100755 +--- a/libavutil/frame.c ++++ b/libavutil/frame.c +@@ -352,6 +352,8 @@ static int frame_copy_props(AVFrame *dst, const AVFrame *src, int force_copy) + dst->palette_has_changed = src->palette_has_changed; + dst->sample_rate = src->sample_rate; + dst->opaque = src->opaque; ++ dst->tm_in = src->tm_in; ++ dst->tm_out = src->tm_out; + #if FF_API_PKT_PTS + FF_DISABLE_DEPRECATION_WARNINGS + dst->pkt_pts = src->pkt_pts; +@@ -390,7 +392,7 @@ FF_ENABLE_DEPRECATION_WARNINGS + && (src->width != dst->width || src->height != dst->height)) + continue; + if (sd_src->type == AV_FRAME_DATA_INFERENCE_CLASSIFICATION || +- sd_src->type == AV_FRAME_DATA_INFERENCE_DETECTION) ++ sd_src->type == AV_FRAME_DATA_INFERENCE_DETECTION || sd_src->type == AV_FRAME_DATA_INFERENCE_INFER) + keep_ref = 1; + if (force_copy && !keep_ref) { + sd_dst = av_frame_new_side_data(dst, sd_src->type, +@@ -842,6 +844,7 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type) + case AV_FRAME_DATA_ICC_PROFILE: return "ICC profile"; + case AV_FRAME_DATA_INFERENCE_CLASSIFICATION: return "Inference classification metadata"; + case AV_FRAME_DATA_INFERENCE_DETECTION: return "Inference detection metadata"; ++ case AV_FRAME_DATA_INFERENCE_INFER: return "Inference tensor metadata"; + #if FF_API_FRAME_QP + case AV_FRAME_DATA_QP_TABLE_PROPERTIES: return "QP table properties"; + case AV_FRAME_DATA_QP_TABLE_DATA: return "QP table data"; +diff --git a/libavutil/frame.h b/libavutil/frame.h +index d2f39eafd0..148393cafb 100755 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -146,6 +146,8 @@ enum AVFrameSideDataType { + + AV_FRAME_DATA_INFERENCE_DETECTION, + ++ AV_FRAME_DATA_INFERENCE_INFER, ++ + #if 
FF_API_FRAME_QP + /** + * Implementation-specific description of the format of AV_FRAME_QP_TABLE_DATA. +@@ -676,6 +678,8 @@ typedef struct AVFrame { + * for the target frame's private_ref field. + */ + AVBufferRef *private_ref; ++ ++ uint64_t tm_in, tm_out; + } AVFrame; + + #if FF_API_FRAME_GET_SET +diff --git a/python/ffmpeg/__init__.py b/python/ffmpeg/__init__.py +new file mode 100755 +index 0000000000..74da718e5b +--- /dev/null ++++ b/python/ffmpeg/__init__.py +@@ -0,0 +1,7 @@ ++# ============================================================================== ++# Copyright (C) 2018-2020 Intel Corporation ++# ++# SPDX-License-Identifier: MIT ++# ============================================================================== ++ ++from .video_frame import VideoFrame +diff --git a/python/ffmpeg/avutil.py b/python/ffmpeg/avutil.py +new file mode 100755 +index 0000000000..4a573e4318 +--- /dev/null ++++ b/python/ffmpeg/avutil.py +@@ -0,0 +1,15 @@ ++# ============================================================================== ++# Copyright (C) 2018-2020 Intel Corporation ++# ++# SPDX-License-Identifier: MIT ++# ============================================================================== ++ ++import ctypes ++from .ffmpeg_decls import AV_FRAME_POINTER, AVFrameSideDataType, AV_FRAME_SIDE_DATA_POINTER ++ ++# libavutil ++libavutil = ctypes.CDLL("libavutil.so.56.31.100") ++ ++libavutil.av_frame_get_side_data.argtypes = [ ++ AV_FRAME_POINTER, AVFrameSideDataType] ++libavutil.av_frame_get_side_data.restype = AV_FRAME_SIDE_DATA_POINTER +diff --git a/python/ffmpeg/ffmpeg_decls.py b/python/ffmpeg/ffmpeg_decls.py +new file mode 100755 +index 0000000000..cb91c2ce74 +--- /dev/null ++++ b/python/ffmpeg/ffmpeg_decls.py +@@ -0,0 +1,192 @@ ++# ============================================================================== ++# Copyright (C) 2018-2020 Intel Corporation ++# ++# SPDX-License-Identifier: MIT ++# ============================================================================== ++ ++from ctypes import * ++ ++STRING = c_char_p ++ ++# Enum AVPictureType ++AVPictureType = c_int ++AV_PICTURE_TYPE_NONE = 0 ++AV_PICTURE_TYPE_I = 1 ++AV_PICTURE_TYPE_P = 2 ++AV_PICTURE_TYPE_B = 3 ++AV_PICTURE_TYPE_S = 4 ++AV_PICTURE_TYPE_SI = 5 ++AV_PICTURE_TYPE_SP = 6 ++AV_PICTURE_TYPE_BI = 7 ++ ++# Enum AVFrameSideDataType ++AVFrameSideDataType = c_int ++AV_FRAME_DATA_PANSCAN = 0 ++AV_FRAME_DATA_A53_CC = 1 ++AV_FRAME_DATA_STEREO3D = 2 ++AV_FRAME_DATA_MATRIXENCODING = 3 ++AV_FRAME_DATA_DOWNMIX_INFO = 4 ++AV_FRAME_DATA_REPLAYGAIN = 5 ++AV_FRAME_DATA_DISPLAYMATRIX = 6 ++AV_FRAME_DATA_AFD = 7 ++AV_FRAME_DATA_MOTION_VECTORS = 8 ++AV_FRAME_DATA_SKIP_SAMPLES = 9 ++AV_FRAME_DATA_AUDIO_SERVICE_TYPE = 10 ++AV_FRAME_DATA_MASTERING_DISPLAY_METADATA = 11 ++AV_FRAME_DATA_GOP_TIMECODE = 12 ++AV_FRAME_DATA_SPHERICAL = 13 ++AV_FRAME_DATA_CONTENT_LIGHT_LEVEL = 14 ++AV_FRAME_DATA_ICC_PROFILE = 15 ++AV_FRAME_DATA_INFERENCE_CLASSIFICATION = 16 ++AV_FRAME_DATA_INFERENCE_DETECTION = 17 ++AV_FRAME_DATA_INFERENCE_INFER = 18 ++AV_FRAME_DATA_QP_TABLE_PROPERTIES = 19 #if FF_API_FRAME_QP ++AV_FRAME_DATA_QP_TABLE_DATA = 20 #if FF_API_FRAME_QP ++AV_FRAME_DATA_S12M_TIMECODE = 21 ++AV_FRAME_DATA_DYNAMIC_HDR_PLUS = 22 ++AV_FRAME_DATA_REGIONS_OF_INTEREST = 23 ++ ++ ++AVPixelFormat = c_int ++AV_PIX_FMT_NONE = -1 ++AV_PIX_FMT_RGB24 = 2 ++AV_PIX_FMT_BGR24 = 3 ++AV_PIX_FMT_NV12 = 23 ++AV_PIX_FMT_BGRA = 29 ++AV_PIX_FMT_BGR0 = 124 ++ ++class AVRational(Structure): ++ _fields_ = [('num', c_int), ++ ('den', c_int)] ++ ++class AVBufferRef(Structure): ++ _fields_ = 
[('buffer', c_void_p), # AVBuffer * ++ ('data', POINTER(c_uint8)), ++ ('size', c_int)] ++ ++AV_BUFFER_REF_POINTER = POINTER(AVBufferRef) ++ ++class AVDictionary(Structure): ++ _fields_ = [('count', c_int), ++ ('elems', c_void_p)] # AVDictionaryEntry * ++ ++class AVFrameSideData(Structure): ++ _fields_ = [('type', AVFrameSideDataType), ++ ('data', POINTER(c_uint8)), ++ ('size', c_int), ++ ('metadata', POINTER(AVDictionary)), ++ ('buf', POINTER(AVBufferRef))] ++ ++AV_FRAME_SIDE_DATA_POINTER = POINTER(AVFrameSideData) ++ ++class AVFrame(Structure): ++ _fields_ = [('data', POINTER(c_uint8) * 8), ++ ('linesize', c_int * 8), ++ ('extended_data', c_void_p), # uint8_t **extended_data ++ ('width', c_int), ++ ('height', c_int), ++ ('nb_samples', c_int), ++ ('format', c_int), ++ ('key_frame', c_int), ++ ('pict_type', AVPictureType), ++ ('sample_aspect_ratio', AVRational), ++ ('pts', c_int64), ++ ('pkt_pts', c_int64), # FF_API_PKT_PTS(LIBAVUTIL_VERSION_MAJOR < 57) ++ ('pkt_dts', c_int64), ++ ('coded_picture_number', c_int), ++ ('display_picture_number', c_int), ++ ('quality', c_int), ++ ('opaque', c_void_p), ++ ('error', c_uint64 * 8), # FF_API_ERROR_FRAME ++ ('repeat_pict', c_int), ++ ('interlaced_frame', c_int), ++ ('top_field_first', c_int), ++ ('palette_has_changed', c_int), ++ ('reordered_opaque', c_int64), ++ ('sample_rate', c_int), ++ ('channel_layout', c_uint64), ++ ('buf', POINTER(AVBufferRef) * 8), ++ ('extended_buf', POINTER(POINTER(AVBufferRef))), ++ ('nb_extended_buf', c_int), ++ ('side_data', POINTER(POINTER(AVFrameSideData))), ++ ('nb_side_data', c_int), ++ ('flags', c_int), ++ ('color_range', c_int), ++ ('color_primaries', c_int), ++ ('color_trc', c_int), ++ ('colorspace', c_int), ++ ('chroma_location', c_int), ++ ('best_effort_timestamp', c_int64), ++ ('pkt_pos', c_int64), ++ ('pkt_duration', c_int64), ++ ('metadata', POINTER(AVDictionary)), ++ ('decode_error_flags', c_int), ++ ('channels', c_int), ++ ('pkt_size', c_int), ++ ('qscale_table', POINTER(c_int8)), # FF_API_FRAME_QP ++ ('qstride', c_int), # FF_API_FRAME_QP ++ ('qscale_type', c_int), # FF_API_FRAME_QP ++ ('qp_table_buf', POINTER(AVBufferRef)), # FF_API_FRAME_QP ++ ('hw_frames_ctx', POINTER(AVBufferRef)), ++ ('opaque_ref', POINTER(AVBufferRef)), ++ ('crop_top', c_size_t), ++ ('crop_bottom', c_size_t), ++ ('crop_left', c_size_t), ++ ('crop_right', c_size_t), ++ ('private_ref', POINTER(AVBufferRef))] ++ ++AV_FRAME_POINTER = POINTER(AVFrame) ++ ++class IETensorMeta(Structure): ++ _fields_ = [('precision', c_char_p), ++ ('ranks', c_size_t), ++ ('dims', c_size_t * 8), ++ ('layout', c_char_p), ++ ('layer_name', c_char_p), ++ ('model_name', c_char_p), ++ ('buffer', AV_BUFFER_REF_POINTER)] ++ ++INFER_TENSOR_POINTER = POINTER(IETensorMeta) ++ ++class FFVideoRegionOfInterestMeta(Structure): ++ _fields_ = [('type_name', c_char * 16), ++ ('index', c_uint), ++ ('x', c_uint), ++ ('y', c_uint), ++ ('w', c_uint), ++ ('h', c_uint)] ++ ++class InferDetection(Structure): ++ _fields_ = [('x_min', c_float), ++ ('y_min', c_float), ++ ('x_max', c_float), ++ ('y_max', c_float), ++ ('confidence', c_float), ++ ('label_id', c_int), ++ ('label_buf', AV_BUFFER_REF_POINTER), ++ ('roi_meta', FFVideoRegionOfInterestMeta), ++ ('tensor', IETensorMeta)] ++ ++INFER_DETECTION_POINTER = POINTER(InferDetection) ++ ++class BBoxesArray(Structure): ++ _fields_ = [('bbox', POINTER(INFER_DETECTION_POINTER)), ++ ('num', c_int)] ++ ++BBOXES_ARRAY_POINTER = POINTER(BBoxesArray) ++ ++class InferDetectionMeta(Structure): ++ _fields_ = [('bboxes', BBOXES_ARRAY_POINTER)] ++ 
++INFER_DETECTION_META_POINTER = POINTER(InferDetectionMeta)
++
++class TensorsArray(Structure):
++ _fields_ = [('tensors', POINTER(INFER_TENSOR_POINTER)),
++ ('num', c_int)]
++
++TENSORS_ARRAY_POINTER = POINTER(TensorsArray)
++
++class InferTensorMeta(Structure):
++ _fields_ = [('t_array', TENSORS_ARRAY_POINTER)]
++
++INFER_TENSOR_META_POINTER = POINTER(InferTensorMeta)
+diff --git a/python/ffmpeg/video_frame.py b/python/ffmpeg/video_frame.py
+new file mode 100755
+index 0000000000..d7d8096b3b
+--- /dev/null
++++ b/python/ffmpeg/video_frame.py
+@@ -0,0 +1,122 @@
++# ==============================================================================
++# Copyright (C) 2018-2020 Intel Corporation
++#
++# SPDX-License-Identifier: MIT
++# ==============================================================================
++
++import ctypes
++import numpy
++from contextlib import contextmanager
++from typing import List
++
++from .ffmpeg_decls import AVFrame, AV_FRAME_POINTER, \
++ AVFrameSideData, \
++ AVFrameSideDataType, AV_FRAME_DATA_INFERENCE_INFER, \
++ AVPixelFormat, AV_PIX_FMT_NV12, AV_PIX_FMT_BGRA, AV_PIX_FMT_BGR0, \
++ InferTensorMeta, INFER_TENSOR_META_POINTER, \
++ IETensorMeta, INFER_TENSOR_POINTER
++
++from .avutil import libavutil
++
++class Tensor:
++ def __init__(self, infer_tensor_p : INFER_TENSOR_POINTER):
++ self.__infer_p = infer_tensor_p
++ if not self.__infer_p:
++ raise ValueError("Tensor: infer_tensor_p passed is nullptr")
++ self.__tensor = self.__infer_p.contents
++
++ def get_dims(self)->list:
++ dims = list(self.__tensor.dims)
++ rank = int(self.__tensor.ranks)
++ return dims[:rank]
++
++ def get_layout(self)->str:
++ return str(self.__tensor.layout, 'utf-8', 'ignore')
++
++ def get_precision(self)->str:
++ return str(self.__tensor.precision, 'utf-8', 'ignore')
++
++ def get_layer_name(self)->str:
++ return str(self.__tensor.layer_name, 'utf-8', 'ignore')
++
++ def get_model_name(self)->str:
++ return str(self.__tensor.model_name, 'utf-8', 'ignore')
++
++ def data(self) -> numpy.ndarray:
++ precision = str(self.__tensor.precision, 'utf-8', 'ignore')
++ if precision == "FP32":
++ view = numpy.float32
++ elif precision == "U8":
++ view = numpy.uint8
++ else:
++ raise ValueError("Tensor: precision unsupported")
++ av_bufferref_p = self.__tensor.buffer
++ if not av_bufferref_p:
++ return None
++ data_ptr = av_bufferref_p.contents.data
++ nbytes = av_bufferref_p.contents.size
++ array_type = ctypes.c_ubyte * int(nbytes)
++ return numpy.ctypeslib.as_array(array_type.from_address(ctypes.addressof(data_ptr.contents))).view(dtype=view)
++
++class VideoFrame:
++ def __init__(self, av_frame_addr):
++ self.__frame_p = ctypes.cast(av_frame_addr, AV_FRAME_POINTER)
++ self.__width = self.__frame_p.contents.width
++ self.__height = self.__frame_p.contents.height
++ self.__init_tensors()
++
++ ## @brief Get image width of this VideoFrame
++ # @return width of the image
++ def get_width(self):
++ return self.__width
++
++ ## @brief Get image height of this VideoFrame
++ # @return height of the image
++ def get_height(self):
++ return self.__height
++
++ ## @brief Get Tensor objects attached to VideoFrame
++ # @return list of Tensor objects attached to VideoFrame
++ def get_tensors(self) -> List[Tensor]:
++ return self.__tensors[:] # copy list
++
++ @contextmanager
++ def data(self) -> numpy.ndarray:
++ pix_fmt = self.__frame_p.contents.format
++ w = self.__width
++ h = self.__height
++ channel = 3
++ if pix_fmt == AV_PIX_FMT_NV12:
++ h = int(h * 1.5)
++ elif pix_fmt == AV_PIX_FMT_BGRA or
pix_fmt == AV_PIX_FMT_BGR0:
++ channel = 4
++ else:
++ raise RuntimeError("VideoFrame.data: Unsupported format")
++
++ size = w * h * channel
++ _data_ = self.__frame_p.contents.data[0]
++ data = ctypes.cast(_data_, ctypes.POINTER(ctypes.c_uint8 * size)).contents
++
++ try:
++ yield numpy.ndarray((h, w, channel), buffer=data, dtype=numpy.uint8)
++ except TypeError as e:
++ raise e
++
++ def __init_tensors(self):
++ self.__tensors = []
++ side_data_p = libavutil.av_frame_get_side_data(self.__frame_p, AV_FRAME_DATA_INFERENCE_INFER)
++ if not side_data_p:
++ return
++ infer_meta_p = ctypes.cast(side_data_p.contents.data, INFER_TENSOR_META_POINTER)
++ if not infer_meta_p:
++ return
++ t_array_p = infer_meta_p.contents.t_array
++ if not t_array_p:
++ return
++ tensor_array_p = t_array_p.contents.tensors
++ number = t_array_p.contents.num
++ if not tensor_array_p or not number:
++ return
++ for i in range(number):
++ infer_tensor_p = tensor_array_p[i]
++ self.__tensors.append(Tensor(infer_tensor_p))
+--
+2.17.1
+
diff --git a/samples/model_proc/person-detection-retail-0002.json b/samples/model_proc/person-detection-retail-0002.json
new file mode 100644
index 0000000..754fb19
--- /dev/null
+++ b/samples/model_proc/person-detection-retail-0002.json
@@ -0,0 +1,11 @@
+{
+ "input_preproc": [{
+ "color_format": "BGR"
+ }],
+ "json_schema_version": 1,
+ "output_postproc": [{
+ "converter": "DetectionOutput",
+ "labels": [ "background", "person" ],
+ "layer_name": "detection_out"
+ }]
+}
diff --git a/samples/model_proc/person-reidentification-retail-0079.json b/samples/model_proc/person-reidentification-retail-0079.json
old mode 100755
new mode 100644
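
Usage note (illustration only, not part of the patch set): the new 'python' filter imports the module given by its 'custom_script' option, resolves the callable named by 'function_name' (default: process_frame), wraps each AVFrame in ffmpeg.VideoFrame, and invokes it once per frame. A minimal sketch of such a script, using only the Tensor/VideoFrame accessors defined in python/ffmpeg/video_frame.py above; the file name postproc_example.py is hypothetical:

# postproc_example.py -- hypothetical name; pass its full path via the
# 'custom_script' option (the filter rejects paths without a '/').
def process_frame(frame):
    # Tensors come from the AV_FRAME_DATA_INFERENCE_INFER side data that the
    # upstream 'infer' filter attaches; the list is empty when none is present.
    for tensor in frame.get_tensors():
        data = tensor.data()  # numpy view over the AVBufferRef payload, or None
        print(tensor.get_model_name(), tensor.get_layer_name(),
              tensor.get_dims(), tensor.get_precision(),
              None if data is None else data.shape)
    # Raw pixels are reachable too; VideoFrame.data() only handles
    # NV12/BGRA/BGR0 and raises RuntimeError for other pixel formats.
    try:
        with frame.data() as pixels:
            print("frame %dx%d" % (frame.get_width(), frame.get_height()), pixels.shape)
    except RuntimeError:
        pass
    return 0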
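
And a similarly hedged sketch of driving the two new filters end to end from Python. The input file, model IR, model_proc JSON, and directory layout are placeholders; the option names follow the inference_infer/inference_python option tables above:

import subprocess

# Placeholders: substitute a real OpenVINO IR model, its model_proc JSON
# (see samples/model_proc/ above), the built python/ directory containing
# the 'ffmpeg' package, and the script sketched in the previous note.
cmd = [
    "ffmpeg", "-i", "input.mp4",
    "-vf",
    "infer=model=model.xml:model_proc=model_proc.json:device=CPU:nireq=2,"
    "python=ffmodule_path=./python:custom_script=/abs/path/postproc_example.py",
    "-f", "null", "-",
]
subprocess.run(cmd, check=True)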