diff --git a/3rdparty/libwebp/CMakeLists.txt b/3rdparty/libwebp/CMakeLists.txt
index 80ab0b86ab76..9160e2024ca0 100644
--- a/3rdparty/libwebp/CMakeLists.txt
+++ b/3rdparty/libwebp/CMakeLists.txt
@@ -32,7 +32,9 @@ endif()
 #         Define the library target:
 # ----------------------------------------------------------------------------------
 
-add_definitions(-DWEBP_USE_THREAD)
+if(NOT OPENCV_DISABLE_THREAD_SUPPORT)
+  add_definitions(-DWEBP_USE_THREAD)
+endif()
 
 add_library(${WEBP_LIBRARY} STATIC ${OPENCV_3RDPARTY_EXCLUDE_FROM_ALL} ${lib_srcs} ${lib_hdrs})
 if(ANDROID)
diff --git a/3rdparty/readme.txt b/3rdparty/readme.txt
index 3b961782b097..4e4a6ba0a653 100644
--- a/3rdparty/readme.txt
+++ b/3rdparty/readme.txt
@@ -31,7 +31,7 @@ libpng                Portable Network Graphics library.
 libtiff               Tag Image File Format (TIFF) Software
                       Copyright (c) 1988-1997 Sam Leffler
                       Copyright (c) 1991-1997 Silicon Graphics, Inc.
-                      See libtiff home page http://www.remotesensing.org/libtiff/
+                      See libtiff home page http://www.libtiff.org/
                       for details and links to the source code
 
                       WITH_TIFF CMake option must be ON to add libtiff & zlib support to imgcodecs.
@@ -51,7 +51,9 @@ jasper                JasPer is a collection of software
                       Copyright (c) 1999-2000 The University of British Columbia
                       Copyright (c) 2001-2003 Michael David Adams
 
-                      The JasPer license can be found in libjasper.
+                      See JasPer official GitHub repository
+                      https://github.com/jasper-software/jasper.git
+                      for details and links to source code
 ------------------------------------------------------------------------------------
 openexr               OpenEXR is a high dynamic-range (HDR) image file format developed
                       by Industrial Light & Magic for use in computer imaging applications.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02749e4d7241..e8cd8105cf51 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -515,6 +515,8 @@ OCV_OPTION(OPENCV_GENERATE_SETUPVARS  "Generate setup_vars* scripts" ON IF (NOT
 OCV_OPTION(ENABLE_CONFIG_VERIFICATION "Fail build if actual configuration doesn't match requested (WITH_XXX != HAVE_XXX)" OFF)
 OCV_OPTION(OPENCV_ENABLE_MEMALIGN     "Enable posix_memalign or memalign usage" ON)
 OCV_OPTION(OPENCV_DISABLE_FILESYSTEM_SUPPORT "Disable filesystem support" OFF)
+OCV_OPTION(OPENCV_DISABLE_THREAD_SUPPORT "Build the library without multi-threaded code." OFF)
+OCV_OPTION(OPENCV_SEMIHOSTING         "Build the library for semihosting target (Arm). See https://developer.arm.com/documentation/100863/latest." OFF)
 
 OCV_OPTION(ENABLE_PYLINT              "Add target with Pylint checks"                            (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
 OCV_OPTION(ENABLE_FLAKE8              "Add target with Python flake8 checker"                    (BUILD_DOCS OR BUILD_EXAMPLES) IF (NOT CMAKE_CROSSCOMPILING AND NOT APPLE_FRAMEWORK) )
@@ -661,7 +663,7 @@ if(UNIX)
     elseif(EMSCRIPTEN)
       # no need to link to system libs with emscripten
     elseif(QNXNTO)
-      # no need to link to system libs with QNX
+      set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} m)
     else()
       set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} dl m pthread rt)
     endif()
@@ -669,6 +671,11 @@ if(UNIX)
     set(HAVE_PTHREAD 1)
   endif()
 
+  # Ensure that libpthread is not listed as one of the libraries to pass to the linker.
+  if (OPENCV_DISABLE_THREAD_SUPPORT)
+    list(REMOVE_ITEM OPENCV_LINKER_LIBS pthread)
+  endif()
+
   if(OPENCV_ENABLE_MEMALIGN)
     CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
     CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
@@ -915,7 +922,7 @@ add_subdirectory(include)
 ocv_add_modules_compiler_options()
 
 # OpenCV modules
-add_subdirectory(modules)
+ocv_register_modules()
 
 # Generate targets for documentation
 add_subdirectory(doc)
@@ -1245,7 +1252,7 @@ endif(WIN32)
 
 # ========================== GUI ==========================
 status("")
-status("  GUI: ")
+status("  GUI: " "${OPENCV_HIGHGUI_BUILTIN_BACKEND}")
 
 if(WITH_QT OR HAVE_QT)
   if(HAVE_QT5)
@@ -1467,6 +1474,15 @@ ocv_build_features_string(parallel_status EXCLUSIVE
   ELSE "none")
 status("")
 status("  Parallel framework:" "${parallel_status}")
+if (OPENCV_DISABLE_THREAD_SUPPORT)
+  status("" "Multi thread code explicitly disabled with OPENCV_DISABLE_THREAD_SUPPORT.")
+  if(HAVE_PTHREADS_PF OR HAVE_HPX OR HAVE_OPENMP OR HAVE_GCD OR HAVE_CONCURRENCY)
+    message(FATAL_ERROR "Not all parallel frameworks have been disabled (using ${parallel_status}).")
+  endif()
+  if(HAVE_PTHREAD)
+    message(FATAL_ERROR "Thread execution might be in use in some component.")
+  endif()
+endif()
 
 if(CV_TRACE OR OPENCV_TRACE)
   ocv_build_features_string(trace_status EXCLUSIVE
diff --git a/apps/model-diagnostics/model_diagnostics.cpp b/apps/model-diagnostics/model_diagnostics.cpp
index 2ffeaa1ea5b9..6970c8507108 100644
--- a/apps/model-diagnostics/model_diagnostics.cpp
+++ b/apps/model-diagnostics/model_diagnostics.cpp
@@ -1,9 +1,10 @@
 /*************************************************
 USAGE:
-./model_diagnostics -m <onnx file location>
+./model_diagnostics -m <model file location>
 **************************************************/
 #include <opencv2/dnn.hpp>
 #include <opencv2/core/utils/filesystem.hpp>
+#include <opencv2/dnn/utils/debug_utils.hpp>
 
 #include <iostream>
 
@@ -32,7 +33,7 @@ static std::string checkFileExists(const std::string& fileName)
 }
 
 std::string diagnosticKeys =
-        "{ model m     | | Path to the model .onnx file. }"
+        "{ model m     | | Path to the model file. }"
         "{ config c    | | Path to the model configuration file. }"
         "{ framework f | | [Optional] Name of the model framework. }";
 
@@ -41,7 +42,7 @@ std::string diagnosticKeys =
 int main( int argc, const char** argv )
 {
     CommandLineParser argParser(argc, argv, diagnosticKeys);
-    argParser.about("Use this tool to run the diagnostics of provided ONNX model"
+    argParser.about("Use this tool to run the diagnostics of provided ONNX/TF model"
                     "to obtain the information about its support (supported layers).");
 
     if (argc == 1)
@@ -57,6 +58,7 @@ int main( int argc, const char** argv )
     CV_Assert(!model.empty());
 
     enableModelDiagnostics(true);
+    skipModelImport(true);
     redirectError(diagnosticsErrorCallback, NULL);
 
     Net ocvNet = readNet(model, config, frameworkId);
diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake
index a161b6eb8b64..2917dd33d5ee 100644
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@@ -178,8 +178,17 @@ if(CV_GCC OR CV_CLANG)
     add_extra_compiler_option(-Wno-long-long)
   endif()
 
-  # We need pthread's
-  if(UNIX AND NOT ANDROID AND NOT (APPLE AND CV_CLANG)) # TODO
+  # We need pthread's, unless we have explicitly disabled multi-thread execution.
+  if(NOT OPENCV_DISABLE_THREAD_SUPPORT
+      AND (
+        (UNIX
+          AND NOT ANDROID
+          AND NOT (APPLE AND CV_CLANG)
+          AND NOT EMSCRIPTEN
+        )
+        OR (EMSCRIPTEN AND WITH_PTHREADS_PF)  # https://github.com/opencv/opencv/issues/20285
+      )
+  ) # TODO
     add_extra_compiler_option(-pthread)
   endif()
 
diff --git a/cmake/OpenCVDetectCUDA.cmake b/cmake/OpenCVDetectCUDA.cmake
index d12a9e68ea8b..10f1288141d9 100644
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -112,7 +112,7 @@ if(CUDA_FOUND)
   if(CUDA_GENERATION)
     if(NOT ";${_generations};" MATCHES ";${CUDA_GENERATION};")
       string(REPLACE ";" ", " _generations "${_generations}")
-      message(FATAL_ERROR "ERROR: ${_generations} Generations are suppered.")
+      message(FATAL_ERROR "ERROR: ${_generations} Generations are supported.")
     endif()
     unset(CUDA_ARCH_BIN CACHE)
     unset(CUDA_ARCH_PTX CACHE)
diff --git a/cmake/OpenCVDetectHalide.cmake b/cmake/OpenCVDetectHalide.cmake
index 790f69205662..4828c299aead 100644
--- a/cmake/OpenCVDetectHalide.cmake
+++ b/cmake/OpenCVDetectHalide.cmake
@@ -9,9 +9,14 @@ set(HALIDE_ROOT_DIR "${HALIDE_ROOT_DIR}" CACHE PATH "Halide root directory")
 if(NOT HAVE_HALIDE)
   find_package(Halide QUIET) # Try CMake-based config files
   if(Halide_FOUND)
-    set(HALIDE_INCLUDE_DIRS "${Halide_INCLUDE_DIRS}" CACHE PATH "Halide include directories" FORCE)
-    set(HALIDE_LIBRARIES "${Halide_LIBRARIES}" CACHE PATH "Halide libraries" FORCE)
-    set(HAVE_HALIDE TRUE)
+    if(TARGET Halide::Halide)  # modern Halide scripts defines imported target
+      set(HALIDE_INCLUDE_DIRS "")
+      set(HALIDE_LIBRARIES "Halide::Halide")
+      set(HAVE_HALIDE TRUE)
+    else()
+      # using HALIDE_INCLUDE_DIRS / Halide_LIBRARIES
+      set(HAVE_HALIDE TRUE)
+    endif()
   endif()
 endif()
 
@@ -28,18 +33,15 @@ if(NOT HAVE_HALIDE AND HALIDE_ROOT_DIR)
   )
   if(HALIDE_LIBRARY AND HALIDE_INCLUDE_DIR)
     # TODO try_compile
-    set(HALIDE_INCLUDE_DIRS "${HALIDE_INCLUDE_DIR}" CACHE PATH "Halide include directories" FORCE)
-    set(HALIDE_LIBRARIES "${HALIDE_LIBRARY}" CACHE PATH "Halide libraries" FORCE)
+    set(HALIDE_INCLUDE_DIRS "${HALIDE_INCLUDE_DIR}")
+    set(HALIDE_LIBRARIES "${HALIDE_LIBRARY}")
     set(HAVE_HALIDE TRUE)
   endif()
-  if(NOT HAVE_HALIDE)
-    ocv_clear_vars(HALIDE_LIBRARIES HALIDE_INCLUDE_DIRS CACHE)
-  endif()
 endif()
 
 if(HAVE_HALIDE)
-  include_directories(${HALIDE_INCLUDE_DIRS})
+  if(HALIDE_INCLUDE_DIRS)
+    include_directories(${HALIDE_INCLUDE_DIRS})
+  endif()
   list(APPEND OPENCV_LINKER_LIBS ${HALIDE_LIBRARIES})
-else()
-  ocv_clear_vars(HALIDE_INCLUDE_DIRS HALIDE_LIBRARIES)
 endif()
diff --git a/cmake/OpenCVDetectInferenceEngine.cmake b/cmake/OpenCVDetectInferenceEngine.cmake
index 2c0296d63463..b9fd07bbfbb1 100644
--- a/cmake/OpenCVDetectInferenceEngine.cmake
+++ b/cmake/OpenCVDetectInferenceEngine.cmake
@@ -134,16 +134,21 @@ endif()
 # Add more features to the target
 
 if(INF_ENGINE_TARGET)
-  if(InferenceEngine_VERSION VERSION_GREATER_EQUAL "2021.4")
-    math(EXPR INF_ENGINE_RELEASE "${InferenceEngine_VERSION_MAJOR} * 1000000 + ${InferenceEngine_VERSION_MINOR} * 10000 + ${InferenceEngine_VERSION_PATCH} * 100")
+  if(DEFINED InferenceEngine_VERSION)
+    message(STATUS "InferenceEngine: ${InferenceEngine_VERSION}")
+    if(NOT INF_ENGINE_RELEASE AND NOT (InferenceEngine_VERSION VERSION_LESS "2021.4"))
+      math(EXPR INF_ENGINE_RELEASE_INIT "${InferenceEngine_VERSION_MAJOR} * 1000000 + ${InferenceEngine_VERSION_MINOR} * 10000 + ${InferenceEngine_VERSION_PATCH} * 100")
+    endif()
   endif()
-  if(NOT INF_ENGINE_RELEASE)
-    message(WARNING "InferenceEngine version has not been set, 2021.3 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
-    set(INF_ENGINE_RELEASE "2021030000")
+  if(NOT INF_ENGINE_RELEASE AND NOT INF_ENGINE_RELEASE_INIT)
+    message(WARNING "InferenceEngine version has not been set, 2021.4 will be used by default. Set INF_ENGINE_RELEASE variable if you experience build errors.")
+    set(INF_ENGINE_RELEASE_INIT "2021040000")
+  elseif(DEFINED INF_ENGINE_RELEASE)
+    set(INF_ENGINE_RELEASE_INIT "${INF_ENGINE_RELEASE}")
   endif()
-  set(INF_ENGINE_RELEASE "${INF_ENGINE_RELEASE}" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
+  set(INF_ENGINE_RELEASE "${INF_ENGINE_RELEASE_INIT}" CACHE STRING "Force IE version, should be in form YYYYAABBCC (e.g. 2020.1.0.2 -> 2020010002)")
   set_target_properties(${INF_ENGINE_TARGET} PROPERTIES
-    INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
+      INTERFACE_COMPILE_DEFINITIONS "HAVE_INF_ENGINE=1;INF_ENGINE_RELEASE=${INF_ENGINE_RELEASE}"
   )
 endif()
 
diff --git a/cmake/OpenCVDetectVTK.cmake b/cmake/OpenCVDetectVTK.cmake
index b8cf36007cf2..57c154475c67 100644
--- a/cmake/OpenCVDetectVTK.cmake
+++ b/cmake/OpenCVDetectVTK.cmake
@@ -1,34 +1,34 @@
-# VTK 9.0
 if(NOT VTK_FOUND)
-  find_package(VTK 9 QUIET NAMES vtk COMPONENTS
-    FiltersExtraction
-    FiltersSources
-    FiltersTexture
-    IOExport
-    IOGeometry
-    IOPLY
-    InteractionStyle
-    RenderingCore
-    RenderingLOD
-    RenderingOpenGL2
-    NO_MODULE)
-endif()
-
-# VTK 6.x components
-if(NOT VTK_FOUND)
-  find_package(VTK QUIET COMPONENTS vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport NO_MODULE)
-  IF(VTK_FOUND)
-    IF(VTK_RENDERING_BACKEND) #in vtk 7, the rendering backend is exported as a var.
-      find_package(VTK QUIET COMPONENTS vtkRendering${VTK_RENDERING_BACKEND} vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport vtkIOGeometry NO_MODULE)
-    ELSE(VTK_RENDERING_BACKEND)
-      find_package(VTK QUIET COMPONENTS vtkRenderingOpenGL vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport NO_MODULE)
-    ENDIF(VTK_RENDERING_BACKEND)
-  ENDIF(VTK_FOUND)
-endif()
-
-# VTK 5.x components
-if(NOT VTK_FOUND)
-  find_package(VTK QUIET COMPONENTS vtkCommon NO_MODULE)
+  find_package(VTK QUIET NAMES vtk VTK)
+  if(VTK_FOUND)
+    if(VTK_VERSION VERSION_EQUAL "9") # VTK 9.0
+      find_package(VTK 9 QUIET NAMES vtk COMPONENTS
+              FiltersExtraction
+              FiltersSources
+              FiltersTexture
+              IOExport
+              IOGeometry
+              IOPLY
+              InteractionStyle
+              RenderingCore
+              RenderingLOD
+              RenderingOpenGL2
+              NO_MODULE)
+    elseif(VTK_VERSION VERSION_GREATER "5") # VTK 6.x components
+      find_package(VTK QUIET COMPONENTS vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport NO_MODULE)
+      IF(VTK_FOUND)
+        IF(VTK_RENDERING_BACKEND) #in vtk 7, the rendering backend is exported as a var.
+          find_package(VTK QUIET COMPONENTS vtkRendering${VTK_RENDERING_BACKEND} vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport vtkIOGeometry NO_MODULE)
+        ELSE(VTK_RENDERING_BACKEND)
+          find_package(VTK QUIET COMPONENTS vtkRenderingOpenGL vtkInteractionStyle vtkRenderingLOD vtkIOPLY vtkFiltersTexture vtkRenderingFreeType vtkIOExport NO_MODULE)
+        ENDIF(VTK_RENDERING_BACKEND)
+      ENDIF(VTK_FOUND)
+    elseif(VTK_VERSION VERSION_EQUAL "5") # VTK 5.x components
+      find_package(VTK QUIET COMPONENTS vtkCommon NO_MODULE)
+    else()
+      set(VTK_FOUND FALSE)
+    endif()
+  endif()
 endif()
 
 if(NOT VTK_FOUND)
diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake
index 8030e8b0c0fc..c8ec55b58864 100644
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -2,15 +2,6 @@
 #  Detect 3rd-party GUI libraries
 # ----------------------------------------------------------------------------
 
-#--- Win32 UI ---
-ocv_clear_vars(HAVE_WIN32UI)
-if(WITH_WIN32UI)
-  try_compile(HAVE_WIN32UI
-    "${OpenCV_BINARY_DIR}"
-    "${OpenCV_SOURCE_DIR}/cmake/checks/win32uitest.cpp"
-    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=user32;gdi32")
-endif()
-
 # --- QT4/5 ---
 ocv_clear_vars(HAVE_QT HAVE_QT5)
 if(WITH_QT)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake
index 7c48aad9c295..9981620f2560 100644
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -254,7 +254,7 @@ function(_glob_locations out_paths out_names)
     list(LENGTH paths before)
     get_filename_component(path "${path}" ABSOLUTE)
     # Either module itself
-    if(NOT path STREQUAL CMAKE_CURRENT_SOURCE_DIR AND EXISTS "${path}/CMakeLists.txt")
+    if(NOT path STREQUAL "${OpenCV_SOURCE_DIR}/modules" AND EXISTS "${path}/CMakeLists.txt")
       get_filename_component(name "${path}" NAME)
       list(APPEND paths "${path}")
       list(APPEND names "${name}")
@@ -296,7 +296,7 @@ macro(_add_modules_1 paths names)
       list(GET ${names} ${i} __name)
       #message(STATUS "First pass: ${__name} => ${__path}")
       include("${__path}/cmake/init.cmake" OPTIONAL)
-      add_subdirectory("${__path}" "${CMAKE_CURRENT_BINARY_DIR}/.firstpass/${__name}")
+      add_subdirectory("${__path}" "${OpenCV_BINARY_DIR}/modules/.firstpass/${__name}")
     endforeach()
   endif()
 endmacro()
@@ -316,7 +316,7 @@ macro(_add_modules_2)
       endif()
       string(REGEX REPLACE "^opencv_" "" name "${m}")
       #message(STATUS "Second pass: ${name} => ${OPENCV_MODULE_${m}_LOCATION}")
-      add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${CMAKE_CURRENT_BINARY_DIR}/${name}")
+      add_subdirectory("${OPENCV_MODULE_${m}_LOCATION}" "${OpenCV_BINARY_DIR}/modules/${name}")
     endif()
     ocv_cmake_hook(POST_MODULES_CREATE_${the_module})
   endforeach()
@@ -369,7 +369,6 @@ macro(ocv_glob_modules main_root)
   __ocv_resolve_dependencies()
 
   # create modules
-  set(OPENCV_INITIAL_PASS OFF PARENT_SCOPE)
   set(OPENCV_INITIAL_PASS OFF)
   ocv_cmake_hook(PRE_MODULES_CREATE)
   _add_modules_2(${OPENCV_MODULES_BUILD})
@@ -377,6 +376,37 @@ macro(ocv_glob_modules main_root)
 endmacro()
 
 
+# called by root CMakeLists.txt
+macro(ocv_register_modules)
+  if(NOT OPENCV_MODULES_PATH)
+    set(OPENCV_MODULES_PATH "${OpenCV_SOURCE_DIR}/modules")
+  endif()
+
+  ocv_glob_modules(${OPENCV_MODULES_PATH} ${OPENCV_EXTRA_MODULES_PATH})
+
+  # build lists of modules to be documented
+  set(OPENCV_MODULES_MAIN "")
+  set(OPENCV_MODULES_EXTRA "")
+
+  foreach(mod ${OPENCV_MODULES_BUILD} ${OPENCV_MODULES_DISABLED_USER} ${OPENCV_MODULES_DISABLED_AUTO} ${OPENCV_MODULES_DISABLED_FORCE})
+    string(REGEX REPLACE "^opencv_" "" mod "${mod}")
+    if("${OPENCV_MODULE_opencv_${mod}_LOCATION}" STREQUAL "${OpenCV_SOURCE_DIR}/modules/${mod}")
+      list(APPEND OPENCV_MODULES_MAIN ${mod})
+    else()
+      list(APPEND OPENCV_MODULES_EXTRA ${mod})
+    endif()
+  endforeach()
+  ocv_list_sort(OPENCV_MODULES_MAIN)
+  ocv_list_sort(OPENCV_MODULES_EXTRA)
+  set(FIXED_ORDER_MODULES core imgproc imgcodecs videoio highgui video calib3d features2d objdetect dnn ml flann photo stitching)
+  list(REMOVE_ITEM OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES})
+  set(OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES} ${OPENCV_MODULES_MAIN})
+
+  set(OPENCV_MODULES_MAIN ${OPENCV_MODULES_MAIN} CACHE INTERNAL "List of main modules" FORCE)
+  set(OPENCV_MODULES_EXTRA ${OPENCV_MODULES_EXTRA} CACHE INTERNAL "List of extra modules" FORCE)
+endmacro()
+
+
 # disables OpenCV module with missing dependencies
 function(__ocv_module_turn_off the_module)
   list(REMOVE_ITEM OPENCV_MODULES_DISABLED_AUTO "${the_module}")
@@ -877,6 +907,7 @@ macro(ocv_create_module)
 endmacro()
 
 macro(_ocv_create_module)
+  add_definitions(-D__OPENCV_BUILD=1)
 
   ocv_compiler_optimization_process_sources(OPENCV_MODULE_${the_module}_SOURCES OPENCV_MODULE_${the_module}_DEPS_EXT ${the_module})
   set(__module_headers ${OPENCV_MODULE_${the_module}_HEADERS})
diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake
index 252078bdf776..932eb039b141 100644
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@@ -1481,8 +1481,8 @@ function(ocv_target_link_libraries target)
       if(NOT LINK_PENDING STREQUAL "")
         __ocv_push_target_link_libraries(${LINK_MODE} ${LINK_PENDING})
         set(LINK_PENDING "")
-        set(LINK_MODE "${dep}")
       endif()
+      set(LINK_MODE "${dep}")
     else()
       if(BUILD_opencv_world)
         if(OPENCV_MODULE_${dep}_IS_PART_OF_WORLD)
@@ -1973,3 +1973,9 @@ if(NOT BUILD_SHARED_LIBS AND (CMAKE_VERSION VERSION_LESS "3.14.0"))
 else()
   ocv_update(OPENCV_3RDPARTY_EXCLUDE_FROM_ALL "EXCLUDE_FROM_ALL")
 endif()
+
+
+#
+# Include configuration override settings
+#
+include("${CMAKE_CURRENT_LIST_DIR}/vars/EnableModeVars.cmake")
diff --git a/cmake/checks/cpu_rvv.cpp b/cmake/checks/cpu_rvv.cpp
index a3eab2abc44e..684b2ecbebfa 100644
--- a/cmake/checks/cpu_rvv.cpp
+++ b/cmake/checks/cpu_rvv.cpp
@@ -9,7 +9,7 @@
 int test()
 {
     const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
-    vfloat32m1_t val = vle32_v_f32m1((const float*)(src));
+    vfloat32m1_t val = vle32_v_f32m1((const float*)(src), 4);
     return (int)vfmv_f_s_f32m1_f32(val);
 }
 #else
diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in
index 99ec4802d2cb..39708e14bdbf 100644
--- a/cmake/templates/cvconfig.h.in
+++ b/cmake/templates/cvconfig.h.in
@@ -124,9 +124,6 @@
 /* TIFF codec */
 #cmakedefine HAVE_TIFF
 
-/* Win32 UI */
-#cmakedefine HAVE_WIN32UI
-
 /* Define if your processor stores words with the most significant byte
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #cmakedefine WORDS_BIGENDIAN
diff --git a/cmake/vars/EnableModeVars.cmake b/cmake/vars/EnableModeVars.cmake
new file mode 100644
index 000000000000..3f017af496f2
--- /dev/null
+++ b/cmake/vars/EnableModeVars.cmake
@@ -0,0 +1,21 @@
+set(__OCV_MODE_VARS_DIR "${CMAKE_CURRENT_LIST_DIR}")
+
+macro(ocv_change_mode_var)
+  set(__var "${ARGV0}")
+  set(__mode "${ARGV1}")
+  set(__value "${ARGV2}")
+  if(__mode STREQUAL "MODIFIED_ACCESS" AND __value)
+    if(NOT __applied_mode_${__var})
+      include("${__OCV_MODE_VARS_DIR}/${__var}.cmake")
+      set(__applied_mode_${__var} 1)
+    else()
+      #message("Mode is already applied: ${__var}")
+    endif()
+  endif()
+endmacro()
+
+variable_watch(OPENCV_DISABLE_THREAD_SUPPORT ocv_change_mode_var)
+set(OPENCV_DISABLE_THREAD_SUPPORT "${OPENCV_DISABLE_THREAD_SUPPORT}")
+
+variable_watch(OPENCV_SEMIHOSTING ocv_change_mode_var)
+set(OPENCV_SEMIHOSTING "${OPENCV_SEMIHOSTING}")
diff --git a/cmake/vars/OPENCV_DISABLE_THREAD_SUPPORT.cmake b/cmake/vars/OPENCV_DISABLE_THREAD_SUPPORT.cmake
new file mode 100644
index 000000000000..5f5fc0204dfc
--- /dev/null
+++ b/cmake/vars/OPENCV_DISABLE_THREAD_SUPPORT.cmake
@@ -0,0 +1,28 @@
+# Force removal of code conditionally compiled with `#if
+# HAVE_PTHREAD`.
+ocv_update(HAVE_PTHREAD 0)
+
+# There components are disabled because they require
+# multi-threaded execution.
+ocv_update(WITH_PROTOBUF OFF)
+ocv_update(WITH_GSTREAMER OFF)
+ocv_update(WITH_IPP OFF)
+ocv_update(WITH_ITT OFF)
+ocv_update(WITH_OPENCL OFF)
+ocv_update(WITH_VA OFF)
+ocv_update(WITH_VA_INTEL OFF)
+
+# Disable bindings
+ocv_update(BUILD_opencv_python2 OFF)
+ocv_update(BUILD_opencv_python3 OFF)
+ocv_update(BUILD_JAVA OFF)
+ocv_update(BUILD_opencv_java OFF)
+
+# These modules require `#include
+# <[thread|mutex|condition_variable|future]>` and linkage into
+# `libpthread` to work.
+ocv_update(BUILD_opencv_objdetect OFF)
+ocv_update(BUILD_opencv_gapi OFF)
+ocv_update(BUILD_opencv_dnn OFF)
+
+set(OPJ_USE_THREAD "OFF" CACHE INTERNAL "")
diff --git a/cmake/vars/OPENCV_SEMIHOSTING.cmake b/cmake/vars/OPENCV_SEMIHOSTING.cmake
new file mode 100644
index 000000000000..66f21c7ebddc
--- /dev/null
+++ b/cmake/vars/OPENCV_SEMIHOSTING.cmake
@@ -0,0 +1,10 @@
+set(CV_TRACE OFF)
+
+# These third parties libraries are incompatible with the semihosting
+# toolchain.
+set(WITH_JPEG OFF)
+set(WITH_OPENEXR OFF)
+set(WITH_TIFF OFF)
+
+# Turn off `libpng` for some linking issues.
+set(WITH_PNG OFF)
diff --git a/doc/js_tutorials/js_imgproc/js_contours/js_contour_features/js_contour_features.markdown b/doc/js_tutorials/js_imgproc/js_contours/js_contour_features/js_contour_features.markdown
index 22544b280c60..842126958731 100644
--- a/doc/js_tutorials/js_imgproc/js_contours/js_contour_features/js_contour_features.markdown
+++ b/doc/js_tutorials/js_imgproc/js_contours/js_contour_features/js_contour_features.markdown
@@ -1,6 +1,9 @@
 Contour Features {#tutorial_js_contour_features}
 ================
 
+@prev_tutorial{tutorial_js_contours_begin}
+@next_tutorial{tutorial_js_contour_properties}
+
 Goal
 ----
 
diff --git a/doc/js_tutorials/js_imgproc/js_contours/js_contour_properties/js_contour_properties.markdown b/doc/js_tutorials/js_imgproc/js_contours/js_contour_properties/js_contour_properties.markdown
index 1dbb15c4cf3e..3392283305c0 100644
--- a/doc/js_tutorials/js_imgproc/js_contours/js_contour_properties/js_contour_properties.markdown
+++ b/doc/js_tutorials/js_imgproc/js_contours/js_contour_properties/js_contour_properties.markdown
@@ -1,6 +1,9 @@
 Contour Properties {#tutorial_js_contour_properties}
 ==================
 
+@prev_tutorial{tutorial_js_contour_features}
+@next_tutorial{tutorial_js_contours_more_functions}
+
 Goal
 ----
 
diff --git a/doc/js_tutorials/js_imgproc/js_contours/js_contours_begin/js_contours_begin.markdown b/doc/js_tutorials/js_imgproc/js_contours/js_contours_begin/js_contours_begin.markdown
index 9678a7c9f05d..3caf17f873a4 100644
--- a/doc/js_tutorials/js_imgproc/js_contours/js_contours_begin/js_contours_begin.markdown
+++ b/doc/js_tutorials/js_imgproc/js_contours/js_contours_begin/js_contours_begin.markdown
@@ -1,6 +1,8 @@
 Contours : Getting Started {#tutorial_js_contours_begin}
 ==========================
 
+@next_tutorial{tutorial_js_contour_features}
+
 Goal
 ----
 
diff --git a/doc/js_tutorials/js_imgproc/js_contours/js_contours_hierarchy/js_contours_hierarchy.markdown b/doc/js_tutorials/js_imgproc/js_contours/js_contours_hierarchy/js_contours_hierarchy.markdown
index c98628e48648..c2e408a96292 100644
--- a/doc/js_tutorials/js_imgproc/js_contours/js_contours_hierarchy/js_contours_hierarchy.markdown
+++ b/doc/js_tutorials/js_imgproc/js_contours/js_contours_hierarchy/js_contours_hierarchy.markdown
@@ -1,6 +1,8 @@
 Contours Hierarchy {#tutorial_js_contours_hierarchy}
 ==================
 
+@prev_tutorial{tutorial_js_contours_more_functions}
+
 Goal
 ----
 
diff --git a/doc/js_tutorials/js_imgproc/js_contours/js_contours_more_functions/js_contours_more_functions.markdown b/doc/js_tutorials/js_imgproc/js_contours/js_contours_more_functions/js_contours_more_functions.markdown
index b75311666271..941f0c486a1f 100644
--- a/doc/js_tutorials/js_imgproc/js_contours/js_contours_more_functions/js_contours_more_functions.markdown
+++ b/doc/js_tutorials/js_imgproc/js_contours/js_contours_more_functions/js_contours_more_functions.markdown
@@ -1,6 +1,9 @@
 Contours : More Functions {#tutorial_js_contours_more_functions}
 =========================
 
+@prev_tutorial{tutorial_js_contour_properties}
+@next_tutorial{tutorial_js_contours_hierarchy}
+
 Goal
 ----
 
diff --git a/doc/py_tutorials/py_imgproc/py_canny/py_canny.markdown b/doc/py_tutorials/py_imgproc/py_canny/py_canny.markdown
index cbc2a72eecc7..d36e5784ebc8 100644
--- a/doc/py_tutorials/py_imgproc/py_canny/py_canny.markdown
+++ b/doc/py_tutorials/py_imgproc/py_canny/py_canny.markdown
@@ -74,7 +74,7 @@ Canny Edge Detection in OpenCV
 
 OpenCV puts all the above in single function, **cv.Canny()**. We will see how to use it. First
 argument is our input image. Second and third arguments are our minVal and maxVal respectively.
-Third argument is aperture_size. It is the size of Sobel kernel used for find image gradients. By
+Fourth argument is aperture_size. It is the size of Sobel kernel used for find image gradients. By
 default it is 3. Last argument is L2gradient which specifies the equation for finding gradient
 magnitude. If it is True, it uses the equation mentioned above which is more accurate, otherwise it
 uses this function: \f$Edge\_Gradient \; (G) = |G_x| + |G_y|\f$. By default, it is False.
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.markdown
index 5af1e5a1e054..f3c7f6fc312a 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contour_features/py_contour_features.markdown
@@ -1,6 +1,9 @@
 Contour Features {#tutorial_py_contour_features}
 ================
 
+@prev_tutorial{tutorial_py_contours_begin}
+@next_tutorial{tutorial_py_contour_properties}
+
 Goal
 ----
 
@@ -91,7 +94,7 @@ convexity defects, which are the local maximum deviations of hull from contours.
 
 There is a little bit things to discuss about it its syntax:
 @code{.py}
-hull = cv.convexHull(points[, hull[, clockwise[, returnPoints]]
+hull = cv.convexHull(points[, hull[, clockwise[, returnPoints]]])
 @endcode
 Arguments details:
 
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contour_properties/py_contour_properties.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contour_properties/py_contour_properties.markdown
index 461c87034398..282f62ddf98e 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contour_properties/py_contour_properties.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contour_properties/py_contour_properties.markdown
@@ -1,6 +1,9 @@
 Contour Properties {#tutorial_py_contour_properties}
 ==================
 
+@prev_tutorial{tutorial_py_contour_features}
+@next_tutorial{tutorial_py_contours_more_functions}
+
 Here we will learn to extract some frequently used properties of objects like Solidity, Equivalent
 Diameter, Mask image, Mean Intensity etc. More features can be found at [Matlab regionprops
 documentation](http://www.mathworks.in/help/images/ref/regionprops.html).
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.markdown
index a2b89c1c9679..0049d3131d7d 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contours_begin/py_contours_begin.markdown
@@ -1,6 +1,8 @@
 Contours : Getting Started {#tutorial_py_contours_begin}
 ==========================
 
+@next_tutorial{tutorial_py_contour_features}
+
 Goal
 ----
 
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contours_hierarchy/py_contours_hierarchy.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contours_hierarchy/py_contours_hierarchy.markdown
index 2619ea2a7095..075e6ec81f93 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contours_hierarchy/py_contours_hierarchy.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contours_hierarchy/py_contours_hierarchy.markdown
@@ -1,6 +1,8 @@
 Contours Hierarchy {#tutorial_py_contours_hierarchy}
 ==================
 
+@prev_tutorial{tutorial_py_contours_more_functions}
+
 Goal
 ----
 
diff --git a/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown b/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
index e50f9dd6c934..65f5b75401b9 100644
--- a/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
+++ b/doc/py_tutorials/py_imgproc/py_contours/py_contours_more_functions/py_contours_more_functions.markdown
@@ -1,6 +1,10 @@
 Contours : More Functions {#tutorial_py_contours_more_functions}
 =========================
 
+@prev_tutorial{tutorial_py_contour_properties}
+@next_tutorial{tutorial_py_contours_hierarchy}
+
+
 Goal
 ----
 
diff --git a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown
index 4f6f2b8a887e..c53296b3bf98 100644
--- a/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown
+++ b/doc/tutorials/core/mat_the_basic_image_container/mat_the_basic_image_container.markdown
@@ -91,8 +91,8 @@ a new header with the new boundaries:
 Mat D (A, Rect(10, 10, 100, 100) ); // using a rectangle
 Mat E = A(Range::all(), Range(1,3)); // using row and column boundaries
 @endcode
-Now you may ask -- if the matrix itself may belong to multiple *Mat* objects who takes responsibility
-for cleaning it up when it's no longer needed. The short answer is: the last object that used it.
+Now you may ask -- if the matrix itself may belong to multiple *Mat* objects, who takes responsibility
+for cleaning it up when it's no longer needed? The short answer is: the last object that used it.
 This is handled by using a reference counting mechanism. Whenever somebody copies a header of a
 *Mat* object, a counter is increased for the matrix. Whenever a header is cleaned, this counter
 is decreased. When the counter reaches zero the matrix is freed. Sometimes you will want to copy
@@ -102,12 +102,12 @@ Mat F = A.clone();
 Mat G;
 A.copyTo(G);
 @endcode
-Now modifying *F* or *G* will not affect the matrix pointed by the *A*'s header. What you need to
+Now modifying *F* or *G* will not affect the matrix pointed to by the *A*'s header. What you need to
 remember from all this is that:
 
 -   Output image allocation for OpenCV functions is automatic (unless specified otherwise).
 -   You do not need to think about memory management with OpenCV's C++ interface.
--   The assignment operator and the copy constructor only copies the header.
+-   The assignment operator and the copy constructor only copy the header.
 -   The underlying matrix of an image may be copied using the @ref cv::Mat::clone() and @ref cv::Mat::copyTo()
     functions.
 
@@ -122,10 +122,10 @@ of these allows us to create many shades of gray.
 For *colorful* ways we have a lot more methods to choose from. Each of them breaks it down to three
 or four basic components and we can use the combination of these to create the others. The most
 popular one is RGB, mainly because this is also how our eye builds up colors. Its base colors are
-red, green and blue. To code the transparency of a color sometimes a fourth element: alpha (A) is
+red, green and blue. To code the transparency of a color sometimes a fourth element, alpha (A), is
 added.
 
-There are, however, many other color systems each with their own advantages:
+There are, however, many other color systems, each with their own advantages:
 
 -   RGB is the most common as our eyes use something similar, however keep in mind that OpenCV standard display
     system composes colors using the BGR color space (red and blue channels are swapped places).
@@ -139,11 +139,11 @@ There are, however, many other color systems each with their own advantages:
 Each of the building components has its own valid domains. This leads to the data type used. How
 we store a component defines the control we have over its domain. The smallest data type possible is
 *char*, which means one byte or 8 bits. This may be unsigned (so can store values from 0 to 255) or
-signed (values from -127 to +127). Although in case of three components this already gives 16
-million possible colors to represent (like in case of RGB) we may acquire an even finer control by
+signed (values from -127 to +127). Although this width, in the case of three components (like RGB), already gives 16
+million possible colors to represent, we may acquire an even finer control by
 using the float (4 byte = 32 bit) or double (8 byte = 64 bit) data types for each component.
 Nevertheless, remember that increasing the size of a component also increases the size of the whole
-picture in the memory.
+picture in memory.
 
 Creating a Mat object explicitly
 ----------------------------------
diff --git a/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
index 5f28b6ce7a16..b0be2627b291 100644
--- a/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
+++ b/doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
@@ -26,6 +26,11 @@ Before recognition, you should `setVocabulary` and `setDecodeType`.
     - `T` is the sequence length
     - `B` is the batch size (only support `B=1` in inference)
     - and `Dim` is the length of vocabulary +1('Blank' of CTC is at the index=0 of Dim).
+- "CTC-prefix-beam-search", the output of the text recognition model should be a probability matrix same with "CTC-greedy".
+    - The algorithm is proposed at Hannun's [paper](https://arxiv.org/abs/1408.2873).
+    - `setDecodeOptsCTCPrefixBeamSearch` could be used to control the beam size in search step.
+    - To futher optimize for big vocabulary, a new option `vocPruneSize` is introduced to avoid iterate the whole vocbulary
+       but only the number of `vocPruneSize` tokens with top probabilty.
 
 @ref cv::dnn::TextRecognitionModel::recognize() is the main function for text recognition.
 - The input image should be a cropped text image or an image with `roiRects`
diff --git a/doc/tutorials/introduction/config_reference/config_reference.markdown b/doc/tutorials/introduction/config_reference/config_reference.markdown
index 58b4ed55ca41..0ba5627249ec 100644
--- a/doc/tutorials/introduction/config_reference/config_reference.markdown
+++ b/doc/tutorials/introduction/config_reference/config_reference.markdown
@@ -589,6 +589,14 @@ Some features have been added specifically for automated build environments, lik
 | `OPENCV_CMAKE_HOOKS_DIR` | _empty_ | OpenCV allows to customize configuration process by adding custom hook scripts at each stage and substage. cmake scripts with predefined names located in the directory set by this variable will be included before and after various configuration stages. Examples of file names: _CMAKE_INIT.cmake_, _PRE_CMAKE_BOOTSTRAP.cmake_, _POST_CMAKE_BOOTSTRAP.cmake_, etc.. Other names are not documented and can be found in the project cmake files by searching for the _ocv_cmake_hook_ macro calls. |
 | `OPENCV_DUMP_HOOKS_FLOW` | _OFF_ | Enables a debug message print on each cmake hook script call. |
 
+## Contrib Modules
+
+Following build options are utilized in `opencv_contrib` modules, as stated [previously](#tutorial_config_reference_general_contrib), these extra modules can be added to your final build by setting `DOPENCV_EXTRA_MODULES_PATH` option.
+
+| Option | Default | Description |
+| ------ | ------- | ----------- |
+| `WITH_CLP` | _OFF_ | Will add [coinor](https://projects.coin-or.org/Clp) linear programming library build support which is required in `videostab` module. Make sure to install the development libraries of coinor-clp. |
+
 
 # Other non-documented options
 
@@ -605,7 +613,6 @@ Some features have been added specifically for automated build environments, lik
 `WITH_CPUFEATURES`
 `WITH_EIGEN`
 `WITH_OPENVX`
-`WITH_CLP`
 `WITH_DIRECTX`
 `WITH_VA`
 `WITH_LAPACK`
diff --git a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
index eb59fea209d6..ee3f1eb7f9b9 100644
--- a/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
+++ b/doc/tutorials/introduction/linux_gcc_cmake/linux_gcc_cmake.markdown
@@ -1,7 +1,7 @@
 Using OpenCV with gcc and CMake {#tutorial_linux_gcc_cmake}
 ===============================
 
-@prev_tutorial{tutorial_linux_install}
+@prev_tutorial{tutorial_linux_gdb_pretty_printer}
 @next_tutorial{tutorial_linux_eclipse}
 
 |    |    |
diff --git a/doc/tutorials/introduction/linux_gdb_pretty_printer/images/example.png b/doc/tutorials/introduction/linux_gdb_pretty_printer/images/example.png
new file mode 100644
index 000000000000..0ec673dcc21d
Binary files /dev/null and b/doc/tutorials/introduction/linux_gdb_pretty_printer/images/example.png differ
diff --git a/doc/tutorials/introduction/linux_gdb_pretty_printer/linux_gdb_pretty_printer.markdown b/doc/tutorials/introduction/linux_gdb_pretty_printer/linux_gdb_pretty_printer.markdown
new file mode 100644
index 000000000000..9d6446992000
--- /dev/null
+++ b/doc/tutorials/introduction/linux_gdb_pretty_printer/linux_gdb_pretty_printer.markdown
@@ -0,0 +1,38 @@
+Using OpenCV with gdb-powered IDEs {#tutorial_linux_gdb_pretty_printer}
+=====================
+
+@prev_tutorial{tutorial_linux_install}
+@next_tutorial{tutorial_linux_gcc_cmake}
+
+|    |    |
+| -: | :- |
+| Original author | Egor Smirnov |
+| Compatibility | OpenCV >= 4.0 |
+
+@tableofcontents
+
+# Capabilities {#tutorial_linux_gdb_pretty_printer_capabilities}
+
+This pretty-printer can show element type, `is_continuous`, `is_submatrix` flags and (possibly truncated) matrix. It is known to work in Clion, VS Code and gdb.
+
+![Clion example](images/example.png)
+
+
+# Installation {#tutorial_linux_gdb_pretty_printer_installation}
+
+Move into `opencv/samples/gdb/`. Place `mat_pretty_printer.py` in a convinient place, rename `gdbinit` to `.gdbinit`  and move it into your home folder. Change 'source' line of `.gdbinit` to point to your `mat_pretty_printer.py` path.
+
+In order to check version of python bundled with your gdb, use the following commands from the gdb shell:
+
+    python
+    import sys
+    print(sys.version_info)
+    end
+
+If the version of python 3 installed in your system doesn't match the version in gdb, create a new virtual environment with the exact same version, install `numpy` and change the path to python3 in `.gdbinit` accordingly.
+
+
+# Usage {#tutorial_linux_gdb_pretty_printer_usage}
+
+The fields in a debugger prefixed with `view_` are pseudo-fields added for convinience, the rest are left as is.
+If you feel that the number of elements in truncated view is too low, you can edit `mat_pretty_printer.py` - `np.set_printoptions` controlls everything matrix display-related.
diff --git a/doc/tutorials/introduction/linux_install/linux_install.markdown b/doc/tutorials/introduction/linux_install/linux_install.markdown
index 5083fac282f8..e69f6ea70749 100644
--- a/doc/tutorials/introduction/linux_install/linux_install.markdown
+++ b/doc/tutorials/introduction/linux_install/linux_install.markdown
@@ -1,7 +1,7 @@
 Installation in Linux {#tutorial_linux_install}
 =====================
 
-@next_tutorial{tutorial_linux_gcc_cmake}
+@next_tutorial{tutorial_linux_gdb_pretty_printer}
 
 |    |    |
 | -: | :- |
diff --git a/doc/tutorials/introduction/table_of_content_introduction.markdown b/doc/tutorials/introduction/table_of_content_introduction.markdown
index d1f2aa3ca319..8fa89d7d7f9b 100644
--- a/doc/tutorials/introduction/table_of_content_introduction.markdown
+++ b/doc/tutorials/introduction/table_of_content_introduction.markdown
@@ -6,6 +6,7 @@ Introduction to OpenCV {#tutorial_table_of_content_introduction}
 
 ##### Linux
 -   @subpage tutorial_linux_install
+-   @subpage tutorial_linux_gdb_pretty_printer
 -   @subpage tutorial_linux_gcc_cmake
 -   @subpage tutorial_linux_eclipse
 
diff --git a/doc/tutorials/others/traincascade.markdown b/doc/tutorials/others/traincascade.markdown
index 0d95bd003a23..fdb38511beb5 100644
--- a/doc/tutorials/others/traincascade.markdown
+++ b/doc/tutorials/others/traincascade.markdown
@@ -13,6 +13,8 @@ Working with a boosted cascade of weak classifiers includes two major stages: th
 
 To support this tutorial, several official OpenCV applications will be used: [opencv_createsamples](https://github.com/opencv/opencv/tree/master/apps/createsamples), [opencv_annotation](https://github.com/opencv/opencv/tree/master/apps/annotation), [opencv_traincascade](https://github.com/opencv/opencv/tree/master/apps/traincascade) and [opencv_visualisation](https://github.com/opencv/opencv/tree/master/apps/visualisation).
 
+@note Createsamples and traincascade are disabled since OpenCV 4.0. Consider using these apps for training from 3.4 branch for Cascade Classifier. Model format is the same between 3.4 and 4.x.
+
 ### Important notes
 
  - If you come across any tutorial mentioning the old opencv_haartraining tool <i>(which is deprecated and still using the OpenCV1.x interface)</i>, then please ignore that tutorial and stick to the opencv_traincascade tool. This tool is a newer version, written in C++ in accordance to the OpenCV 2.x and OpenCV 3.x API. The opencv_traincascade supports both HAAR like wavelet features @cite Viola01 and LBP (Local Binary Patterns) @cite Liao2007 features. LBP features yield integer precision in contrast to HAAR features, yielding floating point precision, so both training and detection with LBP are several times faster then with HAAR features. Regarding the LBP and HAAR detection quality, it mainly depends on the training data used and the training parameters selected. It's possible to train a LBP-based classifier that will provide almost the same quality as HAAR-based one, within a percentage of the training time.
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
deleted file mode 100644
index 6a8004036b28..000000000000
--- a/modules/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-add_definitions(-D__OPENCV_BUILD=1)
-
-if(NOT OPENCV_MODULES_PATH)
-  set(OPENCV_MODULES_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
-endif()
-
-ocv_glob_modules(${OPENCV_MODULES_PATH} ${OPENCV_EXTRA_MODULES_PATH})
-
-# build lists of modules to be documented
-set(OPENCV_MODULES_MAIN "")
-set(OPENCV_MODULES_EXTRA "")
-
-foreach(mod ${OPENCV_MODULES_BUILD} ${OPENCV_MODULES_DISABLED_USER} ${OPENCV_MODULES_DISABLED_AUTO} ${OPENCV_MODULES_DISABLED_FORCE})
-  string(REGEX REPLACE "^opencv_" "" mod "${mod}")
-  if("${OPENCV_MODULE_opencv_${mod}_LOCATION}" STREQUAL "${OpenCV_SOURCE_DIR}/modules/${mod}")
-    list(APPEND OPENCV_MODULES_MAIN ${mod})
-  else()
-    list(APPEND OPENCV_MODULES_EXTRA ${mod})
-  endif()
-endforeach()
-ocv_list_sort(OPENCV_MODULES_MAIN)
-ocv_list_sort(OPENCV_MODULES_EXTRA)
-set(FIXED_ORDER_MODULES core imgproc imgcodecs videoio highgui video calib3d features2d objdetect dnn ml flann photo stitching)
-list(REMOVE_ITEM OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES})
-set(OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES} ${OPENCV_MODULES_MAIN})
-
-set(OPENCV_MODULES_MAIN ${OPENCV_MODULES_MAIN} CACHE INTERNAL "List of main modules" FORCE)
-set(OPENCV_MODULES_EXTRA ${OPENCV_MODULES_EXTRA} CACHE INTERNAL "List of extra modules" FORCE)
diff --git a/modules/calib3d/src/ap3p.cpp b/modules/calib3d/src/ap3p.cpp
index 386a4499efbe..582b201b36a1 100644
--- a/modules/calib3d/src/ap3p.cpp
+++ b/modules/calib3d/src/ap3p.cpp
@@ -7,8 +7,6 @@
 static inline double cbrt(double x) { return (double)cv::cubeRoot((float)x); };
 #endif
 
-using namespace std;
-
 namespace {
 void solveQuartic(const double *factors, double *realRoots) {
     const double &a4 = factors[0];
@@ -30,29 +28,29 @@ void solveQuartic(const double *factors, double *realRoots) {
     double q3 = (72 * r4 * p4 - 2 * p4 * p4 * p4 - 27 * q4 * q4) / 432; // /=2
 
     double t; // *=2
-    complex<double> w;
+    std::complex<double> w;
     if (q3 >= 0)
-        w = -sqrt(static_cast<complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
+        w = -std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
     else
-        w = sqrt(static_cast<complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
+        w = std::sqrt(static_cast<std::complex<double> >(q3 * q3 - p3 * p3 * p3)) - q3;
     if (w.imag() == 0.0) {
-        w.real(cbrt(w.real()));
+        w.real(std::cbrt(w.real()));
         t = 2.0 * (w.real() + p3 / w.real());
     } else {
         w = pow(w, 1.0 / 3);
         t = 4.0 * w.real();
     }
 
-    complex<double> sqrt_2m = sqrt(static_cast<complex<double> >(-2 * p4 / 3 + t));
+    std::complex<double> sqrt_2m = sqrt(static_cast<std::complex<double> >(-2 * p4 / 3 + t));
     double B_4A = -a3 / (4 * a4);
     double complex1 = 4 * p4 / 3 + t;
 #if defined(__clang__) && defined(__arm__) && (__clang_major__ == 3 || __clang_major__ == 4) && !defined(__ANDROID__)
     // details: https://github.com/opencv/opencv/issues/11135
     // details: https://github.com/opencv/opencv/issues/11056
-    complex<double> complex2 = 2 * q4;
-    complex2 = complex<double>(complex2.real() / sqrt_2m.real(), 0);
+    std::complex<double> complex2 = 2 * q4;
+    complex2 = std::complex<double>(complex2.real() / sqrt_2m.real(), 0);
 #else
-    complex<double> complex2 = 2 * q4 / sqrt_2m;
+    std::complex<double> complex2 = 2 * q4 / sqrt_2m;
 #endif
     double sqrt_2m_rh = sqrt_2m.real() / 2;
     double sqrt1 = sqrt(-(complex1 + complex2)).real() / 2;
diff --git a/modules/calib3d/src/chessboard.cpp b/modules/calib3d/src/chessboard.cpp
index dbc47722cba9..18e2605f53b5 100644
--- a/modules/calib3d/src/chessboard.cpp
+++ b/modules/calib3d/src/chessboard.cpp
@@ -3924,7 +3924,7 @@ bool findChessboardCornersSB(cv::InputArray image_, cv::Size pattern_size,
     {
         meta_.create(int(board.rowCount()),int(board.colCount()),CV_8UC1);
         cv::Mat meta = meta_.getMat();
-        meta = 0;
+        meta.setTo(cv::Scalar::all(0));
         for(int row =0;row < meta.rows-1;++row)
         {
             for(int col=0;col< meta.cols-1;++col)
diff --git a/modules/calib3d/test/test_undistort.cpp b/modules/calib3d/test/test_undistort.cpp
index 9663d36b7862..ea1a95207954 100644
--- a/modules/calib3d/test/test_undistort.cpp
+++ b/modules/calib3d/test/test_undistort.cpp
@@ -897,7 +897,7 @@ void CV_InitInverseRectificationMapTest::prepare_to_validation(int/* test_case_i
     Mat _new_cam0 = zero_new_cam ? test_mat[INPUT][0] : test_mat[INPUT][3];
     Mat _mapx(img_size, CV_32F), _mapy(img_size, CV_32F);
 
-    double a[9], d[5]={0,0,0,0,0}, R[9]={1, 0, 0, 0, 1, 0, 0, 0, 1}, a1[9];
+    double a[9], d[5]={0., 0., 0., 0. , 0.}, R[9]={1., 0., 0., 0., 1., 0., 0., 0., 1.}, a1[9];
     Mat _a(3, 3, CV_64F, a), _a1(3, 3, CV_64F, a1);
     Mat _d(_d0.rows,_d0.cols, CV_MAKETYPE(CV_64F,_d0.channels()),d);
     Mat _R(3, 3, CV_64F, R);
@@ -951,9 +951,9 @@ void CV_InitInverseRectificationMapTest::prepare_to_validation(int/* test_case_i
             // Undistort
             double x2 = x*x, y2 = y*y;
             double r2 = x2 + y2;
-            double cdist = 1./(1 + (d[0] + (d[1] + d[4]*r2)*r2)*r2); // (1 + (d[5] + (d[6] + d[7]*r2)*r2)*r2) == 1 as d[5-7]=0;
-            double x_ = x*cdist - d[2]*2*x*y + d[3]*(r2 + 2*x2);
-            double y_ = y*cdist - d[3]*2*x*y + d[2]*(r2 + 2*y2);
+            double cdist = 1./(1. + (d[0] + (d[1] + d[4]*r2)*r2)*r2); // (1. + (d[5] + (d[6] + d[7]*r2)*r2)*r2) == 1 as d[5-7]=0;
+            double x_ = (x - (d[2]*2.*x*y + d[3]*(r2 + 2.*x2)))*cdist;
+            double y_ = (y - (d[3]*2.*x*y + d[2]*(r2 + 2.*y2)))*cdist;
 
             // Rectify
             double X = R[0]*x_ + R[1]*y_ + R[2];
@@ -1807,4 +1807,78 @@ TEST(Calib3d_initUndistortRectifyMap, regression_14467)
     EXPECT_LE(cvtest::norm(dst, mesh_uv, NORM_INF), 1e-3);
 }
 
+TEST(Calib3d_initInverseRectificationMap, regression_20165)
+{
+    Size size_w_h(1280, 800);
+    Mat dst(size_w_h, CV_32FC2); // Reference for validation
+    Mat mapxy; // Output of initInverseRectificationMap()
+
+    // Camera Matrix
+    double k[9]={
+        1.5393951443032472e+03, 0., 6.7491727003047140e+02,
+        0., 1.5400748240626747e+03, 5.1226968329123963e+02,
+        0., 0., 1.
+    };
+    Mat _K(3, 3, CV_64F, k);
+
+    // Distortion
+    // double d[5]={0,0,0,0,0}; // Zero Distortion
+    double d[5]={ // Non-zero distortion
+        -3.4134571357400023e-03, 2.9733267766101856e-03, // K1, K2
+        3.6653586399031184e-03, -3.1960714017365702e-03, // P1, P2
+        0. // K3
+    };
+    Mat _d(1, 5, CV_64F, d);
+
+    // Rotation
+    //double R[9]={1., 0., 0., 0., 1., 0., 0., 0., 1.}; // Identity transform (none)
+    double R[9]={ // Random transform
+        9.6625486010428052e-01, 1.6055789378989216e-02, 2.5708706103628531e-01,
+        -8.0300261706161002e-03, 9.9944797497929860e-01, -3.2237617614807819e-02,
+       -2.5746274294459848e-01, 2.9085338870243265e-02, 9.6585039165403186e-01
+    };
+    Mat _R(3, 3, CV_64F, R);
+
+    // --- Validation --- //
+    initInverseRectificationMap(_K, _d, _R, _K, size_w_h, CV_32FC2, mapxy, noArray());
+
+    // Copy camera matrix
+    double fx, fy, cx, cy, ifx, ify, cxn, cyn;
+    fx = k[0]; fy = k[4]; cx = k[2]; cy = k[5];
+
+    // Copy new camera matrix
+    ifx = k[0]; ify = k[4]; cxn = k[2]; cyn = k[5];
+
+    // Distort Points
+    for( int v = 0; v < size_w_h.height; v++ )
+    {
+        for( int u = 0; u < size_w_h.width; u++ )
+        {
+            // Convert from image to pin-hole coordinates
+            double x = (u - cx)/fx;
+            double y = (v - cy)/fy;
+
+            // Undistort
+            double x2 = x*x, y2 = y*y;
+            double r2 = x2 + y2;
+            double cdist = 1./(1. + (d[0] + (d[1] + d[4]*r2)*r2)*r2); // (1. + (d[5] + (d[6] + d[7]*r2)*r2)*r2) == 1 as d[5-7]=0;
+            double x_ = (x - (d[2]*2.*x*y + d[3]*(r2 + 2.*x2)))*cdist;
+            double y_ = (y - (d[3]*2.*x*y + d[2]*(r2 + 2.*y2)))*cdist;
+
+            // Rectify
+            double X = R[0]*x_ + R[1]*y_ + R[2];
+            double Y = R[3]*x_ + R[4]*y_ + R[5];
+            double Z = R[6]*x_ + R[7]*y_ + R[8];
+            double x__ = X/Z;
+            double y__ = Y/Z;
+
+            // Convert from pin-hole to image coordinates
+            dst.at<Vec2f>(v, u) = Vec2f((float)(x__*ifx + cxn), (float)(y__*ify + cyn));
+        }
+    }
+
+    // Check Result
+    EXPECT_LE(cvtest::norm(dst, mapxy, NORM_INF), 2e-1);
+}
+
 }} // namespace
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index b2797ab31fc1..13d0af4db82f 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -153,6 +153,14 @@ if(OPENCV_CORE_EXCLUDE_C_API)
   ocv_target_compile_definitions(${the_module} PRIVATE "OPENCV_EXCLUDE_C_API=1")
 endif()
 
+if(OPENCV_DISABLE_THREAD_SUPPORT)
+  ocv_target_compile_definitions(${the_module} PUBLIC "OPENCV_DISABLE_THREAD_SUPPORT=1")
+endif()
+
+if(OPENCV_SEMIHOSTING)
+  ocv_target_compile_definitions(${the_module} PRIVATE "-DOPENCV_SEMIHOSTING")
+endif(OPENCV_SEMIHOSTING)
+
 if(HAVE_HPX)
   ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}")
 endif()
diff --git a/modules/core/include/opencv2/core/bindings_utils.hpp b/modules/core/include/opencv2/core/bindings_utils.hpp
index cf8bcdd62289..e5c60631ebe9 100644
--- a/modules/core/include/opencv2/core/bindings_utils.hpp
+++ b/modules/core/include/opencv2/core/bindings_utils.hpp
@@ -116,6 +116,12 @@ String dumpRange(const Range& argument)
     }
 }
 
+CV_WRAP static inline
+String testReservedKeywordConversion(int positional_argument, int lambda = 2, int from = 3)
+{
+    return format("arg=%d, lambda=%d, from=%d", positional_argument, lambda, from);
+}
+
 CV_WRAP static inline
 void testRaiseGeneralException()
 {
diff --git a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
index cb2140df585b..5b3319378103 100644
--- a/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_rvv.hpp
@@ -151,12 +151,14 @@ struct vint8mf4_t
 };
 
 #define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
-inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \
+inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \
 { \
+    CV_UNUSED(vl); \
     return _Tpvec(ptr); \
 } \
-inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \
+inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \
 { \
+    CV_UNUSED(vl); \
     for (int i = 0; i < n; ++i) \
     { \
             ptr[i] = v.val[i]; \
@@ -176,15 +178,14 @@ OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
 
 
 #define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
-inline _Tpwvec wcvt (_Tpvec v) \
+inline _Tpwvec wcvt (_Tpvec v, size_t vl) \
 { \
     _wTp tmp[n]; \
     for (int i = 0; i < n; ++i) \
     { \
             tmp[i] = (_wTp)v.val[i]; \
     } \
-    vsetvlmax_e##width##m1(); \
-    return vle##width##_v_##suffix##m1(tmp); \
+    return vle##width##_v_##suffix##m1(tmp, vl); \
 }
 
 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
@@ -194,32 +195,34 @@ OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1,
 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
 
-inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base)
+inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl)
 {
+    CV_UNUSED(vl);
     return vuint8mf4_t(base);
 }
-inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base)
+inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl)
 {
+    CV_UNUSED(vl);
     return vint8mf4_t(base);
 }
 
-inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src)
+inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl)
 {
     ushort tmp[4];
     for (int i = 0; i < 4; ++i)
     {
             tmp[i] = (ushort)src.val[i];
     }
-    return vle16_v_u16mf2(tmp);
+    return vle16_v_u16mf2(tmp, vl);
 }
-inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src)
+inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl)
 {
     short tmp[4];
     for (int i = 0; i < 4; ++i)
     {
             tmp[i] = (short)src.val[i];
     }
-    return vle16_v_i16mf2(tmp);
+    return vle16_v_i16mf2(tmp, vl);
 }
 
 //////////// Types ////////////
@@ -232,8 +235,7 @@ struct v_uint8x16
     v_uint8x16() {}
     explicit v_uint8x16(vuint8m1_t v)
     {
-        vsetvlmax_e8m1();
-        vse8_v_u8m1(val, v);
+        vse8_v_u8m1(val, v, nlanes);
     }
     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
@@ -246,8 +248,7 @@ struct v_uint8x16
     }
     operator vuint8m1_t() const
     {
-        vsetvlmax_e8m1();
-        return vle8_v_u8m1(val);
+        return vle8_v_u8m1(val, nlanes);
     }
     uchar get0() const
     {
@@ -265,8 +266,7 @@ struct v_int8x16
     v_int8x16() {}
     explicit v_int8x16(vint8m1_t v)
     {
-        vsetvlmax_e8m1();
-        vse8_v_i8m1(val, v);
+        vse8_v_i8m1(val, v, nlanes);
     }
     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
@@ -279,8 +279,7 @@ struct v_int8x16
     }
     operator vint8m1_t() const
     {
-        vsetvlmax_e8m1();
-        return vle8_v_i8m1(val);
+        return vle8_v_i8m1(val, nlanes);
     }
     schar get0() const
     {
@@ -298,8 +297,7 @@ struct v_uint16x8
     v_uint16x8() {}
     explicit v_uint16x8(vuint16m1_t v)
     {
-        vsetvlmax_e16m1();
-        vse16_v_u16m1(val, v);
+        vse16_v_u16m1(val, v, nlanes);
     }
     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
     {
@@ -311,8 +309,7 @@ struct v_uint16x8
     }
     operator vuint16m1_t() const
     {
-        vsetvlmax_e16m1();
-        return vle16_v_u16m1(val);
+        return vle16_v_u16m1(val, nlanes);
     }
     ushort get0() const
     {
@@ -330,8 +327,7 @@ struct v_int16x8
     v_int16x8() {}
     explicit v_int16x8(vint16m1_t v)
     {
-        vsetvlmax_e16m1();
-        vse16_v_i16m1(val, v);
+        vse16_v_i16m1(val, v, nlanes);
     }
     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
     {
@@ -343,8 +339,7 @@ struct v_int16x8
     }
     operator vint16m1_t() const
     {
-        vsetvlmax_e16m1();
-        return vle16_v_i16m1(val);
+        return vle16_v_i16m1(val, nlanes);
     }
     short get0() const
     {
@@ -362,8 +357,7 @@ struct v_uint32x4
     v_uint32x4() {}
     explicit v_uint32x4(vuint32m1_t v)
     {
-        vsetvlmax_e32m1();
-        vse32_v_u32m1(val, v);
+        vse32_v_u32m1(val, v, nlanes);
     }
     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
     {
@@ -375,8 +369,7 @@ struct v_uint32x4
     }
     operator vuint32m1_t() const
     {
-        vsetvlmax_e32m1();
-        return vle32_v_u32m1(val);
+        return vle32_v_u32m1(val, nlanes);
     }
     unsigned get0() const
     {
@@ -394,8 +387,7 @@ struct v_int32x4
     v_int32x4() {}
     explicit v_int32x4(vint32m1_t v)
     {
-        vsetvlmax_e32m1();
-        vse32_v_i32m1(val, v);
+        vse32_v_i32m1(val, v, nlanes);
     }
     v_int32x4(int v0, int v1, int v2, int v3)
     {
@@ -407,8 +399,7 @@ struct v_int32x4
     }
     operator vint32m1_t() const
     {
-        vsetvlmax_e32m1();
-        return vle32_v_i32m1(val);
+        return vle32_v_i32m1(val, nlanes);
     }
     int get0() const
     {
@@ -425,8 +416,7 @@ struct v_float32x4
     v_float32x4() {}
     explicit v_float32x4(vfloat32m1_t v)
     {
-        vsetvlmax_e32m1();
-        vse32_v_f32m1(val, v);
+        vse32_v_f32m1(val, v, nlanes);
     }
     v_float32x4(float v0, float v1, float v2, float v3)
     {
@@ -438,8 +428,7 @@ struct v_float32x4
     }
     operator vfloat32m1_t() const
     {
-        vsetvlmax_e32m1();
-        return vle32_v_f32m1(val);
+        return vle32_v_f32m1(val, nlanes);
     }
     float get0() const
     {
@@ -456,8 +445,7 @@ struct v_uint64x2
     v_uint64x2() {}
     explicit v_uint64x2(vuint64m1_t v)
     {
-        vsetvlmax_e64m1();
-        vse64_v_u64m1(val, v);
+        vse64_v_u64m1(val, v, nlanes);
     }
     v_uint64x2(uint64 v0, uint64 v1)
     {
@@ -469,8 +457,7 @@ struct v_uint64x2
     }
     operator vuint64m1_t() const
     {
-        vsetvlmax_e64m1();
-        return vle64_v_u64m1(val);
+        return vle64_v_u64m1(val, nlanes);
     }
     uint64 get0() const
     {
@@ -488,8 +475,7 @@ struct v_int64x2
     v_int64x2() {}
     explicit v_int64x2(vint64m1_t v)
     {
-        vsetvlmax_e64m1();
-        vse64_v_i64m1(val, v);
+        vse64_v_i64m1(val, v, nlanes);
     }
     v_int64x2(int64 v0, int64 v1)
     {
@@ -501,8 +487,7 @@ struct v_int64x2
     }
     operator vint64m1_t() const
     {
-        vsetvlmax_e64m1();
-        return vle64_v_i64m1(val);
+        return vle64_v_i64m1(val, nlanes);
     }
     int64 get0() const
     {
@@ -521,8 +506,7 @@ struct v_float64x2
     v_float64x2() {}
     explicit v_float64x2(vfloat64m1_t v)
     {
-        vsetvlmax_e64m1();
-        vse64_v_f64m1(val, v);
+        vse64_v_f64m1(val, v, nlanes);
     }
     v_float64x2(double v0, double v1)
     {
@@ -534,8 +518,7 @@ struct v_float64x2
     }
     operator vfloat64m1_t() const
     {
-        vsetvlmax_e64m1();
-        return vle64_v_f64m1(val);
+        return vle64_v_f64m1(val, nlanes);
     }
     double get0() const
     {
@@ -549,42 +532,38 @@ struct v_float64x2
 
 //////////// Initial ////////////
 
-#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
 inline v_##_Tpvec v_setzero_##suffix1() \
 { \
-    vsetvlmax_e##width##m1(); \
-    return v_##_Tpvec(vzero_##suffix2##m1()); \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \
 } \
 inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \
+    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64)
-OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2)
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2)
 
-#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
 inline v_##_Tpv v_setzero_##suffix() \
 { \
-    vsetvlmax_e##width##m1(); \
-    return v_##_Tpv(vzero_##suffix##m1()); \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \
 } \
 inline v_##_Tpv v_setall_##suffix(_Tp v) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \
+    return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32)
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64)
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2)
 #endif
 
 //////////// Reinterpret ////////////
@@ -605,232 +584,214 @@ OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
 #endif
 
-#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
+#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2, vl1, vl2) \
 inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
 { \
-    vsetvlmax_e##width2##m1(); \
-    return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \
+    return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val, vl2)); \
 } \
 inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
 { \
-    vsetvlmax_e##width1##m1(); \
-    return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \
-}
-
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32)
+    return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val, vl1)); \
+}
+
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8, 16, 16)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16, 8, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32, 4, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32, 4, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32, 4, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64, 2, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32, 16, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64, 16, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64, 8, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64, 4, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32, 16, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64, 16, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64, 8, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64, 4, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32, 16, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64, 16, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8, 8, 16)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64, 8, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8, 4, 16)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16, 4, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64, 4, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8, 2, 16)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16, 2, 8)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32, 2, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32, 16, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32, 2, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32, 16, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32, 2, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64)
-OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64, 2, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64, 2, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64, 16, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64, 6, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64, 4, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64, 16, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64, 8, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64, 4, 2)
+OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64, 4, 2)
 #endif
 
 ////////////// Extract //////////////
 
-#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \
 template <int s> \
 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
 } \
 template<int i> inline _Tp v_extract_n(_Tpvec v) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \
 }
 
 
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64)
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2)
+OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \
+template <int s> \
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \
+} \
+template<int i> inline _Tp v_extract_n(_Tpvec v) \
+{ \
+    return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \
+}
+
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64)
+OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2)
 #endif
 
 ////////////// Load/Store //////////////
 
-#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \
+#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \
 inline _Tpvec v_load(const _Tp* ptr) \
 { \
-    vsetvlmax_e8m1(); \
-    return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \
+    return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr, 16)); \
 } \
 inline _Tpvec v_load_aligned(const _Tp* ptr) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
+    return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \
 } \
 inline _Tpvec v_load_low(const _Tp* ptr) \
 { \
-    vsetvl_e##width##m1(hvl); \
-    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
-    vsetvlmax_e##width##m1(); \
+    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
     return res; \
 } \
 inline void v_store(_Tp* ptr, const _Tpvec& a) \
 { \
-    vsetvlmax_e8m1(); \
-    vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \
+    vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val, 16), 16); \
 } \
 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    vse##width##_v_##suffix##m1(ptr, a); \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
 } \
 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    vse##width##_v_##suffix##m1(ptr, a); \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
 } \
 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
 { \
-    vsetvlmax_e##width##m1(); \
-    vse##width##_v_##suffix##m1(ptr, a); \
+    vse##width##_v_##suffix##m1(ptr, a, vl); \
 } \
 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
 { \
-    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
-    vsetvlmax_e##width##m1(); \
-    vse##width##_v_##suffix##m1(tmp_ptr, a); \
-    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
-    { \
-        ptr[i] = tmp_ptr[i]; \
-    } \
+    vse##width##_v_##suffix##m1(ptr, a, hvl); \
 } \
 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
 { \
-    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
-    vsetvlmax_e##width##m1(); \
-    vse##width##_v_##suffix##m1(tmp_ptr, a); \
-    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
-    { \
-        ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \
-    } \
+    vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \
 }
 
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64)
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64)
+OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1)
 #endif
 
 inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
 {
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
     {
         ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
         ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
     };
-    vsetvlmax_e8m1();
-    return v_int8x16(vle8_v_i8m1(elems));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 }
 inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
 
 inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
     {
         ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
     };
-    vsetvlmax_e16m1();
-    return v_int16x8(vle16_v_i16m1(elems));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
 }
 inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
 
 inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
     {
         ptr0[0], ptr0[1], ptr1[0], ptr1[1]
     };
-    vsetvlmax_e32m1();
-    return v_int32x4(vle32_v_i32m1(elems));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
 }
 inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
     {
         ptr0[0], ptr0[1], ptr1[0], ptr1[1]
     };
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(elems));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
 }
 inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
 
 inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
 {
-    int64 CV_DECL_ALIGNED(32) elems[2] =
+    int64 elems[2] =
     {
         ptr0[0], ptr1[0]
     };
-    vsetvlmax_e64m1();
-    return v_int64x2(vle64_v_i64m1(elems));
+    return v_int64x2(vle64_v_i64m1(elems, 2));
 }
 inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
 
 #if CV_SIMD128_64F
 inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
 {
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
     {
         ptr0[0], ptr1[0]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 #endif
 
@@ -839,7 +800,7 @@ inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
 
 inline v_int8x16 v_lut(const schar* tab, const int* idx)
 {
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
     {
         tab[idx[ 0]],
         tab[idx[ 1]],
@@ -858,12 +819,11 @@ inline v_int8x16 v_lut(const schar* tab, const int* idx)
         tab[idx[14]],
         tab[idx[15]]
     };
-    vsetvlmax_e8m1();
-    return v_int8x16(vle8_v_i8m1(elems));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 }
 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
 {
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
@@ -882,12 +842,11 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
         tab[idx[7]],
         tab[idx[7] + 1]
     };
-    vsetvlmax_e8m1();
-    return v_int8x16(vle8_v_i8m1(elems));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 }
 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
 {
-    schar CV_DECL_ALIGNED(32) elems[16] =
+    schar elems[16] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
@@ -906,8 +865,7 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
         tab[idx[3] + 2],
         tab[idx[3] + 3]
     };
-    vsetvlmax_e8m1();
-    return v_int8x16(vle8_v_i8m1(elems));
+    return v_int8x16(vle8_v_i8m1(elems, 16));
 }
 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
@@ -915,7 +873,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint
 
 inline v_int16x8 v_lut(const short* tab, const int* idx)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
     {
         tab[idx[0]],
         tab[idx[1]],
@@ -926,12 +884,11 @@ inline v_int16x8 v_lut(const short* tab, const int* idx)
         tab[idx[6]],
         tab[idx[7]]
     };
-    vsetvlmax_e16m1();
-    return v_int16x8(vle16_v_i16m1(elems));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
 }
 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
@@ -942,12 +899,11 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
         tab[idx[3]],
         tab[idx[3] + 1]
     };
-    vsetvlmax_e16m1();
-    return v_int16x8(vle16_v_i16m1(elems));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
 }
 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
 {
-    short CV_DECL_ALIGNED(32) elems[8] =
+    short elems[8] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
@@ -958,8 +914,7 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
         tab[idx[1] + 2],
         tab[idx[1] + 3]
     };
-    vsetvlmax_e16m1();
-    return v_int16x8(vle16_v_i16m1(elems));
+    return v_int16x8(vle16_v_i16m1(elems, 8));
 }
 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
@@ -967,32 +922,29 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein
 
 inline v_int32x4 v_lut(const int* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
     {
         tab[idx[0]],
         tab[idx[1]],
         tab[idx[2]],
         tab[idx[3]]
     };
-    vsetvlmax_e32m1();
-    return v_int32x4(vle32_v_i32m1(elems));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
 }
 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
         tab[idx[1]],
         tab[idx[1] + 1]
     };
-    vsetvlmax_e32m1();
-    return v_int32x4(vle32_v_i32m1(elems));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
 }
 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
 {
-    vsetvlmax_e32m1();
-    return v_int32x4(vle32_v_i32m1(tab + idx[0]));
+    return v_int32x4(vle32_v_i32m1(tab + idx[0], 4));
 }
 
 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
@@ -1001,94 +953,86 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re
 
 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
 {
-    int64_t CV_DECL_ALIGNED(32) elems[2] =
+    int64_t elems[2] =
     {
         tab[idx[0]],
         tab[idx[1]]
     };
-    vsetvlmax_e64m1();
-    return v_int64x2(vle64_v_i64m1(elems));
+    return v_int64x2(vle64_v_i64m1(elems, 2));
 }
 inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
 {
-    vsetvlmax_e64m1();
-    return v_int64x2(vle64_v_i64m1(tab + idx[0]));
+    return v_int64x2(vle64_v_i64m1(tab + idx[0], 2));
 }
 inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
 inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
 
 inline v_float32x4 v_lut(const float* tab, const int* idx)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
     {
         tab[idx[0]],
         tab[idx[1]],
         tab[idx[2]],
         tab[idx[3]]
     };
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(elems));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
 }
 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
     {
         tab[idx[0]],
         tab[idx[0] + 1],
         tab[idx[1]],
         tab[idx[1] + 1]
     };
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(elems));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
 }
 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(tab + idx[0]));
+    return v_float32x4(vle32_v_f32m1(tab + idx[0], 4));
 }
 
 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
 {
-    int CV_DECL_ALIGNED(32) elems[4] =
+    int elems[4] =
     {
         tab[v_extract_n<0>(idxvec)],
         tab[v_extract_n<1>(idxvec)],
         tab[v_extract_n<2>(idxvec)],
         tab[v_extract_n<3>(idxvec)]
     };
-    vsetvlmax_e32m1();
-    return v_int32x4(vle32_v_i32m1(elems));
+    return v_int32x4(vle32_v_i32m1(elems, 4));
 }
 
 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
 {
-    unsigned CV_DECL_ALIGNED(32) elems[4] =
+    unsigned elems[4] =
     {
         tab[v_extract_n<0>(idxvec)],
         tab[v_extract_n<1>(idxvec)],
         tab[v_extract_n<2>(idxvec)],
         tab[v_extract_n<3>(idxvec)]
     };
-    vsetvlmax_e32m1();
-    return v_uint32x4(vle32_v_u32m1(elems));
+    return v_uint32x4(vle32_v_u32m1(elems, 4));
 }
 
 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
     {
         tab[v_extract_n<0>(idxvec)],
         tab[v_extract_n<1>(idxvec)],
         tab[v_extract_n<2>(idxvec)],
         tab[v_extract_n<3>(idxvec)]
     };
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(elems));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
 }
 
 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
 {
-    int CV_DECL_ALIGNED(32) idx[4];
+    int idx[4];
     v_store_aligned(idx, idxvec);
 
     x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
@@ -1098,35 +1042,32 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa
 #if CV_SIMD128_64F
 inline v_float64x2 v_lut(const double* tab, const int* idx)
 {
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
     {
         tab[idx[0]],
         tab[idx[1]]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(tab + idx[0]));
+    return v_float64x2(vle64_v_f64m1(tab + idx[0], 2));
 }
 
 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
 {
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double elems[2] =
     {
         tab[v_extract_n<0>(idxvec)],
         tab[v_extract_n<1>(idxvec)]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
 {
-    int CV_DECL_ALIGNED(32) idx[4] = {0};
+    int idx[4] = {0};
     v_store_aligned(idx, idxvec);
 
     x = v_float64x2(tab[idx[0]], tab[idx[1]]);
@@ -1138,30 +1079,28 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
 
 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
 {
-    ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
+    ushort ptr[16] = {0};
     v_store(ptr, a);
     v_store(ptr + 8, b);
-    vsetvlmax_e8m1();
-    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0));
+    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16));
 }
 
 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                            const v_uint32x4& c, const v_uint32x4& d)
 {
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
     v_store(ptr, a);
     v_store(ptr + 4, b);
     v_store(ptr + 8, c);
     v_store(ptr + 12, d);
-    vsetvlmax_e8m1();
-    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0));
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16));
 }
 
 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                            const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                            const v_uint64x2& g, const v_uint64x2& h)
 {
-    uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
+    uint64 ptr[16] = {0};
     v_store(ptr, a);
     v_store(ptr + 2, b);
     v_store(ptr + 4, c);
@@ -1170,95 +1109,89 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin
     v_store(ptr + 10, f);
     v_store(ptr + 12, g);
     v_store(ptr + 14, h);
-    vsetvlmax_e8m1();
-    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0));
+    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16));
 }
 
 ////////////// Arithmetics //////////////
-#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \
+#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \
 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(intrin(a, b)); \
+    return _Tpvec(intrin(a, b, vl)); \
 } \
 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    a = _Tpvec(intrin(a, b)); \
+    a = _Tpvec(intrin(a, b, vl)); \
     return a; \
 }
 
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64)
+OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2)
 #endif
 
 
 ////////////// Bitwise logic //////////////
 
-#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \
-OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \
-OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \
-OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \
+#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \
+OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \
 inline _Tpvec operator ~ (const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vnot_v_##suffix##m1(a)); \
+    return _Tpvec(vnot_v_##suffix##m1(a, vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64)
-OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2)
 
 #define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
 { \
-    vsetvlmax_e32m1(); \
-    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
 } \
 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
 { \
-    vsetvlmax_e32m1(); \
-    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
+    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \
     return a; \
 }
 
@@ -1268,21 +1201,18 @@ OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
 
 inline v_float32x4 operator ~ (const v_float32x4& a)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a))));
+    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4)));
 }
 
 #if CV_SIMD128_64F
 #define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
 { \
-    vsetvlmax_e64m1(); \
-    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
 } \
 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
 { \
-    vsetvlmax_e64m1(); \
-    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
+    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \
     return a; \
 }
 
@@ -1292,119 +1222,110 @@ OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
 
 inline v_float64x2 operator ~ (const v_float64x2& a)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a))));
+    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2)));
 }
 #endif
 
 ////////////// Bitwise shifts //////////////
 
-#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
 inline _Tpvec operator << (const _Tpvec& a, int n) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 inline _Tpvec operator >> (const _Tpvec& a, int n) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \
 }
 
-#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \
 inline _Tpvec operator << (const _Tpvec& a, int n) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 inline _Tpvec operator >> (const _Tpvec& a, int n) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \
 } \
 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
+    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64)
-OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8)
-OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16)
-OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32)
-OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2)
 
 
 ////////////// Comparison //////////////
 
-#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
+    uint64_t ones = -1; \
+    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \
 }
 
-#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \
 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
-}
-
-#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width)
-
-#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width)
-
-#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width)
-
-
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32)
-OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64)
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8)
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16)
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32)
-OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64)
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32)
+    union { uint64 u; double d; } ones; ones.u = -1; \
+    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \
+}
+
+#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl)
+
+#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl)
+
+#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl)
+
+
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4)
+OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4)
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64)
+OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2)
 #endif
 
 inline v_float32x4 v_not_nan(const v_float32x4& a)
@@ -1417,121 +1338,126 @@ inline v_float64x2 v_not_nan(const v_float64x2& a)
 
 ////////////// Min/Max //////////////
 
-#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \
+#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \
 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(intrin(a, b)); \
-}
-
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64)
+    return _Tpvec(intrin(a, b, vl)); \
+}
+
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)
 #endif
 
 ////////////// Arithmetics wrap //////////////
 
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
 
 ////////////// Reduce //////////////
 
-#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
 inline scalartype v_reduce_sum(const _Tpvec& a)  \
 { \
-    vsetvlmax_e##wwidth##m1(); \
-    _nwTpvec zero = vzero_##wsuffix##m1(); \
-    _nwTpvec res = vzero_##wsuffix##m1(); \
-    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \
+    _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \
+    _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \
+    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
     return (scalartype)(_wTpvec(res).get0()); \
 }
 
 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum)
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 4, redsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 4, redsum)
+
+#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \
+inline scalartype v_reduce_sum(const _Tpvec& a)  \
+{ \
+    _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \
+    _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \
+    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \
+    return (scalartype)(_wTpvec(res).get0()); \
+}
+
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 8, fredsum)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum)
+OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 4, fredsum)
 #endif
 
 
-#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \
+#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \
 inline scalartype v_reduce_##func(const _Tpvec& a)  \
 { \
-    vsetvlmax_e##width##m1(); \
-    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \
+    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \
     return scalartype(res.get0()); \
 }
 
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax)
-OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax)
+OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax)
 
 
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                  const v_float32x4& c, const v_float32x4& d)
 {
-    float CV_DECL_ALIGNED(32) elems[4] =
+    float elems[4] =
     {
         v_reduce_sum(a),
         v_reduce_sum(b),
         v_reduce_sum(c),
         v_reduce_sum(d)
     };
-    vsetvlmax_e32m1();
-    return v_float32x4(vle32_v_f32m1(elems));
+    return v_float32x4(vle32_v_f32m1(elems, 4));
 }
 
 ////////////// Square-Root //////////////
 
 inline v_float32x4 v_sqrt(const v_float32x4& x)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vfsqrt_v_f32m1(x));
+    return v_float32x4(vfsqrt_v_f32m1(x, 4));
 }
 
 inline v_float32x4 v_invsqrt(const v_float32x4& x)
@@ -1543,8 +1469,7 @@ inline v_float32x4 v_invsqrt(const v_float32x4& x)
 #if CV_SIMD128_64F
 inline v_float64x2 v_sqrt(const v_float64x2& x)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vfsqrt_v_f64m1(x));
+    return v_float64x2(vfsqrt_v_f64m1(x, 4));
 }
 
 inline v_float64x2 v_invsqrt(const v_float64x2& x)
@@ -1556,29 +1481,25 @@ inline v_float64x2 v_invsqrt(const v_float64x2& x)
 
 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
 {
-    vsetvlmax_e32m1();
-    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
+    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
     return v_sqrt(x);
 }
 
 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
+    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4));
 }
 
 #if CV_SIMD128_64F
 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
 {
-    vsetvlmax_e64m1();
-    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
+    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
     return v_sqrt(x);
 }
 
 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
+    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2));
 }
 #endif
 
@@ -1586,13 +1507,11 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
 
 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vfmacc_vv_f32m1(c, a, b));
+    return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4));
 }
 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
 {
-    vsetvlmax_e32m1();
-    return v_int32x4(vmacc_vv_i32m1(c, a, b));
+    return v_int32x4(vmacc_vv_i32m1(c, a, b, 4));
 }
 
 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
@@ -1608,8 +1527,7 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x
 #if CV_SIMD128_64F
 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vfmacc_vv_f64m1(c, a, b));
+    return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2));
 }
 
 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
@@ -1620,24 +1538,22 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_
 
 ////////////// Check all/any //////////////
 
-#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \
+#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \
 inline bool v_check_all(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \
+    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl)); \
     return (v.val[0] | v.val[1]) == 0; \
 } \
 inline bool v_check_any(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \
+    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift, vl)); \
     return (v.val[0] | v.val[1]) != 0; \
 }
 
-OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8)
-OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16)
-OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32)
-OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4)
+OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2)
 
 
 inline bool v_check_all(const v_int8x16& a)
@@ -1690,16 +1606,15 @@ OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
 
-#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \
+#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, vl) \
 inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \
+    return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b), vl), 0, vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8)
-OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16)
-OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 4)
 
 #define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
 inline _Tprvec v_abs(const _Tpvec& a) \
@@ -1732,149 +1647,152 @@ OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
 
 ////////////// Select //////////////
 
-#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \
+#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \
 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(merge(ne(mask, 0), b, a)); \
+    return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8)
-OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8)
-OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16)
-OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16)
-OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32)
-OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32)
-OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8)
+OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4)
+OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64)
+OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2)
 #endif
 
 ////////////// Rotate shift //////////////
 
-#define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
 } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
+    return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
 { return a; } \
 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
 } \
 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
 { \
-    vsetvlmax_e##width##m1(); \
-    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
 } \
 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
 { CV_UNUSED(b); return a; }
 
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2)
+OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2)
+
+#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
+{ return a; } \
+template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \
+} \
+template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \
+} \
+template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
+{ CV_UNUSED(b); return a; }
 
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64)
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64)
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
+OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2)
 #endif
 
 ////////////// Convert to float //////////////
 
 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
 {
-    vsetvlmax_e32m1();
-    return v_float32x4(vfcvt_f_x_v_f32m1(a));
+    return v_float32x4(vfcvt_f_x_v_f32m1(a, 4));
 }
 
 #if CV_SIMD128_64F
 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
 {
     double arr[4] = {a.val[0], a.val[1], 0, 0};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    vsetvlmax_e32m1();
-    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
 }
 
 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
 {
     double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    vsetvlmax_e32m1();
-    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4));
 }
 
 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
 {
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double ptr[4] = {0};
+    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    double elems[2] =
     {
         ptr[0], ptr[1]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
 {
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double ptr[4] = {0};
+    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4);
+    double elems[2] =
     {
         ptr[2], ptr[3]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
 {
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double ptr[4] = {0};
+    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    double elems[2] =
     {
         ptr[0], ptr[1]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
 {
-    double CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
-    double CV_DECL_ALIGNED(32) elems[2] =
+    double ptr[4] = {0};
+    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4);
+    double elems[2] =
     {
         ptr[2], ptr[3]
     };
-    vsetvlmax_e64m1();
-    return v_float64x2(vle64_v_f64m1(elems));
+    return v_float64x2(vle64_v_f64m1(elems, 2));
 }
 
 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
 {
-    vsetvlmax_e64m1();
-    return v_float64x2(vfcvt_f_x_v_f64m1(a));
+    return v_float64x2(vfcvt_f_x_v_f64m1(a, 2));
 }
 #endif
 
@@ -1907,7 +1825,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
 { \
-    _Tp CV_DECL_ALIGNED(32) elems0[4] = \
+    _Tp elems0[4] = \
     { \
         v_extract_n<0>(a0), \
         v_extract_n<0>(a1), \
@@ -1915,7 +1833,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
         v_extract_n<0>(a3) \
     }; \
     b0 = v_load(elems0); \
-    _Tp CV_DECL_ALIGNED(32) elems1[4] = \
+    _Tp elems1[4] = \
     { \
         v_extract_n<1>(a0), \
         v_extract_n<1>(a1), \
@@ -1923,7 +1841,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
         v_extract_n<1>(a3) \
     }; \
     b1 = v_load(elems1); \
-    _Tp CV_DECL_ALIGNED(32) elems2[4] = \
+    _Tp elems2[4] = \
     { \
         v_extract_n<2>(a0), \
         v_extract_n<2>(a1), \
@@ -1931,7 +1849,7 @@ inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
         v_extract_n<2>(a3) \
     }; \
     b2 = v_load(elems2); \
-    _Tp CV_DECL_ALIGNED(32) elems3[4] = \
+    _Tp elems3[4] = \
     { \
         v_extract_n<3>(a0), \
         v_extract_n<3>(a1), \
@@ -1947,11 +1865,11 @@ OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
 
 ////////////// Reverse //////////////
 
-#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
+#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \
 inline _Tpvec v_reverse(const _Tpvec& a)  \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
+    _Tp ptr[_Tpvec::nlanes] = {0}; \
+    _Tp ptra[_Tpvec::nlanes] = {0}; \
     v_store(ptra, a); \
     for (int i = 0; i < _Tpvec::nlanes; i++) \
     { \
@@ -1960,159 +1878,149 @@ inline _Tpvec v_reverse(const _Tpvec& a)  \
     return v_load(ptr); \
 }
 
-OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
-OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
+OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64)
 #endif
 
 //////////// Value reordering ////////////
 
-#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
+#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \
 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
 { \
-    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
     v_store_low(lptr, a); \
     v_store_high(hptr, a); \
-    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
-    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
+    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
+    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
 } \
 inline _Tpwvec v_expand_low(const _Tpvec& a) \
 { \
-    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp lptr[_Tpvec::nlanes/2] = {0}; \
     v_store_low(lptr, a); \
-    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \
 } \
 inline _Tpwvec v_expand_high(const _Tpvec& a) \
 { \
-    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
+    _Tp hptr[_Tpvec::nlanes/2] = {0}; \
     v_store_high(hptr, a); \
-    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \
 } \
 inline _Tpwvec v_load_expand(const _Tp* ptr) \
 { \
-    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
+    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1)
-OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1)
-OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1)
-OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1)
-OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1)
-OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2)
+OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2)
 
 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
 {
-    vsetvlmax_e32m1();
-    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
+    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4));
 }
 
 inline v_int32x4 v_load_expand_q(const schar* ptr)
 {
-    vsetvlmax_e32m1();
-    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
+    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4));
 }
 
 
-#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
+#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \
 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
     v_store(arr + _wTpvec::nlanes, b); \
-    vsetvlmax_e##width##m2(); \
-    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
+    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \
 } \
 inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
-    vsetvlmax_e##width##m2(); \
-    v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \
 } \
 template<int n> inline \
 _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
     v_store(arr + _wTpvec::nlanes, b); \
-    vsetvlmax_e##width##m2(); \
-    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
+    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \
 } \
 template<int n> inline \
 void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
-    vsetvlmax_e##width##m2(); \
-    v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \
 }
 
-OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
-OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
-OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
-OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
-OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
-OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8)
+OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4)
+OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4)
 
 
-#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
+#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \
 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
     v_store(arr + _wTpvec::nlanes, b); \
-    vsetvlmax_e##width##m2(); \
-    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \
 } \
 inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
-    vsetvlmax_e##width##m2(); \
-    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \
 } \
 template<int n> inline \
 _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
     v_store(arr + _wTpvec::nlanes, b); \
-    vsetvlmax_e##width##m2(); \
-    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
+    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \
 } \
 template<int n> inline \
 void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
 { \
-    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
+    _wTp arr[_Tpvec::nlanes] = {0}; \
     v_store(arr, a); \
-    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
-    vsetvlmax_e##width##m2(); \
-    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
+    v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \
+    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \
 }
 
-OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
-OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16)
+OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8)
 
 
-#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
+#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \
 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptra0, a0); \
     v_store(ptra1, a1); \
     int i; \
@@ -2131,16 +2039,16 @@ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_
 } \
 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
     v_store_low(ptra, a); \
     v_store_low(ptrb, b); \
     return v_load_halves(ptra, ptrb); \
 } \
 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \
     v_store_high(ptra, a); \
     v_store_high(ptrb, b); \
     return v_load_halves(ptra, ptrb); \
@@ -2151,23 +2059,23 @@ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c,
     d = v_combine_high(a, b); \
 }
 
-OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8)
-OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8)
-OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16)
-OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16)
-OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32)
-OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32)
-OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16)
+OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32)
+OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64)
+OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64)
 #endif
 
 
-#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \
+#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
     int i, i2; \
     for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
     { \
@@ -2179,9 +2087,9 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
 } \
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
     int i, i3; \
     for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
     { \
@@ -2196,10 +2104,10 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_
 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
                                 v_##_Tpvec& c, v_##_Tpvec& d) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
     int i, i4; \
     for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
     { \
@@ -2217,8 +2125,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
     int i, i2; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptra, a); \
     v_store(ptrb, b); \
     for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
@@ -2231,9 +2139,9 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
 { \
     int i, i3; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptra, a); \
     v_store(ptrb, b); \
     v_store(ptrc, c); \
@@ -2249,10 +2157,10 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
 { \
     int i, i4; \
-    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptra[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptra, a); \
     v_store(ptrb, b); \
     v_store(ptrc, c); \
@@ -2267,8 +2175,8 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec&
 } \
 inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptrvec, vec); \
     for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
     { \
@@ -2281,8 +2189,8 @@ inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
 } \
 inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptr[v_##_Tpvec::nlanes] = {0}; \
+    _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \
     v_store(ptrvec, vec); \
     for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
     { \
@@ -2298,17 +2206,17 @@ inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
     return v_load(ptr); \
 }
 
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64)
 #if CV_SIMD128_64F
-OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
+OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double)
 #endif
 
 //////////// PopCount ////////////
@@ -2336,9 +2244,9 @@ static const unsigned char popCountTable[] =
 #define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
 inline _rTpvec v_popcount(const _Tpvec& a) \
 { \
-    uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
+    uchar ptra[16] = {0}; \
     v_store(ptra, v_reinterpret_as_u8(a)); \
-    _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _rTp ptr[_Tpvec::nlanes] = {0}; \
     v_store(ptr, v_setzero_##suffix()); \
     for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
         ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
@@ -2356,21 +2264,20 @@ OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
 
 //////////// SignMask ////////////
 
-#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
+#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \
 inline int v_signmask(const _Tpvec& a) \
 { \
     int mask = 0; \
-    vsetvlmax_e##width##m1(); \
-    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
+    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \
     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
         mask |= (int)(tmp.val[i]) << i; \
     return mask; \
 }
 
-OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
-OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
-OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
-OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31)
+OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63)
 
 inline int v_signmask(const v_int8x16& a)
 { return v_signmask(v_reinterpret_as_u8(a)); }
@@ -2393,7 +2300,7 @@ inline int v_signmask(const v_float64x2& a)
 #define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
 inline int v_scan_forward(const _Tpvec& a) \
 { \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
+    _Tp ptr[_Tpvec::nlanes] = {0}; \
     v_store(ptr, v_reinterpret_as_##suffix(a)); \
     for (int i = 0; i < _Tpvec::nlanes; i++) \
         if(int(ptr[i]) < 0) \
@@ -2416,28 +2323,29 @@ OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
 
 //////////// Pack triplets ////////////
 
-#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
-inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
-{ \
-    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
-    _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
-    v_store(ptrvec, vec); \
-    for (int i = 0; i < _Tpvec::nlanes/4; i++) \
-    { \
-        ptr[3*i  ] = ptrvec[4*i  ]; \
-        ptr[3*i+1] = ptrvec[4*i+2]; \
-        ptr[3*i+2] = ptrvec[4*i+2]; \
-    } \
-    return v_load(ptr); \
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec)
+{
+    return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec)));
+}
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
+    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec)
+{
+    return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec)));
 }
 
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
-OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
 
 
 ////// FP16 support ///////
@@ -2445,12 +2353,12 @@ OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
 #if CV_FP16
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
+    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4));
 }
 
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
+    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4);
 }
 #else
 inline v_float32x4 v_load_expand(const float16_t* ptr)
@@ -2474,70 +2382,61 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 
 inline v_int32x4 v_round(const v_float32x4& a)
 {
-    vsetvlmax_e32m1();
-    return v_int32x4(vfcvt_x_f_v_i32m1(a));
+    return v_int32x4(vfcvt_x_f_v_i32m1(a, 4));
 }
 
 inline v_int32x4 v_floor(const v_float32x4& a)
 {
     v_float32x4 ZP5 = v_setall_f32(0.5f);
     v_float32x4 t = a - ZP5;
-    vsetvlmax_e32m1();
-    return v_int32x4(vfcvt_x_f_v_i32m1(t));
+    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
 }
 
 inline v_int32x4 v_ceil(const v_float32x4& a)
 {
     v_float32x4 ZP5 = v_setall_f32(0.5f);
     v_float32x4 t = a + ZP5;
-    vsetvlmax_e32m1();
-    return v_int32x4(vfcvt_x_f_v_i32m1(t));
+    return v_int32x4(vfcvt_x_f_v_i32m1(t, 4));
 }
 
 inline v_int32x4 v_trunc(const v_float32x4& a)
 {
-    vsetvlmax_e32m1();
-    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
+    return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4));
 }
 #if CV_SIMD128_64F
 inline v_int32x4 v_round(const v_float64x2& a)
 {
     double arr[4] = {a.val[0], a.val[1], 0, 0};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
 }
 
 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
 {
     double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
 }
 
 inline v_int32x4 v_floor(const v_float64x2& a)
 {
     double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
 }
 
 inline v_int32x4 v_ceil(const v_float64x2& a)
 {
     double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4));
 }
 
 inline v_int32x4 v_trunc(const v_float64x2& a)
 {
     double arr[4] = {a.val[0], a.val[1], 0, 0};
-    vsetvlmax_e64m2();
-    vfloat64m2_t tmp = vle64_v_f64m2(arr);
-    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
+    vfloat64m2_t tmp = vle64_v_f64m2(arr, 4);
+    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4));
 }
 #endif
 
@@ -2547,19 +2446,17 @@ inline v_int32x4 v_trunc(const v_float64x2& a)
 // 16 >> 32
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 {
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
     v_int32x4 t1, t2;
-    vsetvlmax_e32m2();
-    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
     v_load_deinterleave(ptr, t1, t2);
     return t1 + t2;
 }
 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int ptr[8] = {0};
     v_int32x4 t1, t2;
-    vsetvlmax_e32m2();
-    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
     v_load_deinterleave(ptr, t1, t2);
     return t1 + t2 + c;
 }
@@ -2567,19 +2464,17 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32
 // 32 >> 64
 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
     v_int64x2 t1, t2;
-    vsetvlmax_e64m2();
-    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
     v_load_deinterleave(ptr, t1, t2);
     return t1 + t2;
 }
 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
+    int64 ptr[4] = {0};
     v_int64x2 t1, t2;
-    vsetvlmax_e64m2();
-    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
     v_load_deinterleave(ptr, t1, t2);
     return t1 + t2 + c;
 }
@@ -2587,40 +2482,36 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
 // 8 >> 32
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
 {
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
     v_uint32x4 t1, t2, t3, t4;
-    vsetvlmax_e32m4();
-    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4;
 }
 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                    const v_uint32x4& c)
 {
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
+    unsigned ptr[16] = {0};
     v_uint32x4 t1, t2, t3, t4;
-    vsetvlmax_e32m4();
-    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4 + c;
 }
 
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
 {
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
     v_int32x4 t1, t2, t3, t4;
-    vsetvlmax_e32m4();
-    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4;
 }
 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                   const v_int32x4& c)
 {
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
+    int ptr[16] = {0};
     v_int32x4 t1, t2, t3, t4;
-    vsetvlmax_e32m4();
-    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4 + c;
 }
@@ -2628,39 +2519,35 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
 {
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
     v_uint64x2 t1, t2, t3, t4;
-    vsetvlmax_e64m4();
-    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4;
 }
 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
 {
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    uint64 ptr[8] = {0};
     v_uint64x2 t1, t2, t3, t4;
-    vsetvlmax_e64m4();
-    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4 + c;
 }
 
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
     v_int64x2 t1, t2, t3, t4;
-    vsetvlmax_e64m4();
-    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4;
 }
 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                   const v_int64x2& c)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
+    int64 ptr[8] = {0};
     v_int64x2 t1, t2, t3, t4;
-    vsetvlmax_e64m4();
-    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
     v_load_deinterleave(ptr, t1, t2, t3, t4);
     return t1 + t2 + t3 + t4 + c;
 }
@@ -2679,18 +2566,16 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a,   const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e32m2();
-    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    int ptr[8] = {0};
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
     v_int32x4 t1 = v_load(ptr);
     v_int32x4 t2 = v_load(ptr+4);
     return t1 + t2;
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
-    int CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e32m2();
-    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
+    int ptr[8] = {0};
+    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8);
     v_int32x4 t1 = v_load(ptr);
     v_int32x4 t2 = v_load(ptr+4);
     return t1 + t2 + c;
@@ -2699,18 +2584,16 @@ inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_
 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    int64 ptr[4] = {0};
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
     v_int64x2 t1 = v_load(ptr);
     v_int64x2 t2 = v_load(ptr+2);
     return t1 + t2;
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
-    vsetvlmax_e64m2();
-    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
+    int64 ptr[4] = {0};
+    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4);
     v_int64x2 t1 = v_load(ptr);
     v_int64x2 t2 = v_load(ptr+2);
     return t1 + t2 + c;
@@ -2720,9 +2603,8 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
 // 8 >> 32
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
 {
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
-    vsetvlmax_e32m4();
-    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    unsigned ptr[16] = {0};
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
     v_uint32x4 t1 = v_load(ptr);
     v_uint32x4 t2 = v_load(ptr+4);
     v_uint32x4 t3 = v_load(ptr+8);
@@ -2731,9 +2613,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
 }
 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
 {
-    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
-    vsetvlmax_e32m4();
-    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
+    unsigned ptr[16] = {0};
+    vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16);
     v_uint32x4 t1 = v_load(ptr);
     v_uint32x4 t2 = v_load(ptr+4);
     v_uint32x4 t3 = v_load(ptr+8);
@@ -2742,9 +2623,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
 {
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
-    vsetvlmax_e32m4();
-    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
     v_int32x4 t1 = v_load(ptr);
     v_int32x4 t2 = v_load(ptr+4);
     v_int32x4 t3 = v_load(ptr+8);
@@ -2753,9 +2633,8 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
 }
 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
 {
-    int CV_DECL_ALIGNED(32) ptr[16] = {0};
-    vsetvlmax_e32m4();
-    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
+    int ptr[16] = {0};
+    vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16);
     v_int32x4 t1 = v_load(ptr);
     v_int32x4 t2 = v_load(ptr+4);
     v_int32x4 t3 = v_load(ptr+8);
@@ -2766,9 +2645,8 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, c
 // 16 >> 64
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
 {
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e64m4();
-    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    uint64 ptr[8] = {0};
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
     v_uint64x2 t1 = v_load(ptr);
     v_uint64x2 t2 = v_load(ptr+2);
     v_uint64x2 t3 = v_load(ptr+4);
@@ -2777,9 +2655,8 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
 }
 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
 {
-    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e64m4();
-    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
+    uint64 ptr[8] = {0};
+    vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8);
     v_uint64x2 t1 = v_load(ptr);
     v_uint64x2 t2 = v_load(ptr+2);
     v_uint64x2 t3 = v_load(ptr+4);
@@ -2788,9 +2665,8 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e64m4();
-    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
     v_int64x2 t1 = v_load(ptr);
     v_int64x2 t2 = v_load(ptr+2);
     v_int64x2 t3 = v_load(ptr+4);
@@ -2799,9 +2675,8 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
 }
 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
 {
-    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
-    vsetvlmax_e64m4();
-    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
+    int64 ptr[8] = {0};
+    vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8);
     v_int64x2 t1 = v_load(ptr);
     v_int64x2 t2 = v_load(ptr+2);
     v_int64x2 t3 = v_load(ptr+4);
@@ -2822,11 +2697,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                             const v_float32x4& m1, const v_float32x4& m2,
                             const v_float32x4& m3)
 {
-    vsetvlmax_e32m1();
-    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
-    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
-    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
-    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4);
     return v_float32x4(res);
 }
 
@@ -2834,40 +2708,35 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                                const v_float32x4& m1, const v_float32x4& m2,
                                const v_float32x4& a)
 {
-    vsetvlmax_e32m1();
-    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
-    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
-    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
+    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4);
+    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4);
     return v_float32x4(res) + a;
 }
 
-#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
+#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \
 inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
 { \
-    _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
-    vsetvlmax_e##width##m2(); \
-    vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
-    vsetvlmax_e##width##m1(); \
-    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
-    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
+    _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \
+    vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \
+    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \
+    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \
 }
 
-OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16)
-OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16)
-OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32)
-OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32)
-OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4)
+OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2)
 
 
 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
 {
-    vsetvlmax_e16m1();
-    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
+    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8));
 }
 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
 {
-    vsetvlmax_e16m1();
-    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
+    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8));
 }
 
 
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 5e667b213242..6768be76834b 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -2451,7 +2451,8 @@ class CV_EXPORTS UMat
     //!  <0 - a diagonal from the lower half)
     UMat diag(int d=0) const;
     //! constructs a square diagonal matrix which main diagonal is vector "d"
-    static UMat diag(const UMat& d);
+    static UMat diag(const UMat& d, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat diag(const UMat& d) { return diag(d, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
 
     //! returns deep copy of the matrix, i.e. the data is copied
     UMat clone() const CV_NODISCARD;
@@ -2485,14 +2486,22 @@ class CV_EXPORTS UMat
     double dot(InputArray m) const;
 
     //! Matlab-style matrix initialization
-    static UMat zeros(int rows, int cols, int type);
-    static UMat zeros(Size size, int type);
-    static UMat zeros(int ndims, const int* sz, int type);
-    static UMat ones(int rows, int cols, int type);
-    static UMat ones(Size size, int type);
-    static UMat ones(int ndims, const int* sz, int type);
-    static UMat eye(int rows, int cols, int type);
-    static UMat eye(Size size, int type);
+    static UMat zeros(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat zeros(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat zeros(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat zeros(int rows, int cols, int type) { return zeros(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat zeros(Size size, int type) { return zeros(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat zeros(int ndims, const int* sz, int type) { return zeros(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat ones(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat ones(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat ones(int ndims, const int* sz, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat ones(int rows, int cols, int type) { return ones(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat ones(Size size, int type) { return ones(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat ones(int ndims, const int* sz, int type) { return ones(ndims, sz, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat eye(int rows, int cols, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat eye(Size size, int type, UMatUsageFlags usageFlags /*= USAGE_DEFAULT*/);
+    static UMat eye(int rows, int cols, int type) { return eye(rows, cols, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
+    static UMat eye(Size size, int type) { return eye(size, type, USAGE_DEFAULT); }  // OpenCV 5.0: remove abi compatibility overload
 
     //! allocates new matrix data unless the matrix already has specified size and type.
     // previous data is unreferenced if needed.
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index f9cc9e019a03..03666df5176b 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -235,7 +235,11 @@ class CV_EXPORTS_W_SIMPLE Device
 
     /**
      * @param d OpenCL handle (cl_device_id). clRetainDevice() is called on success.
-     */
+     *
+     * @note Ownership of the passed device is passed to OpenCV on success.
+     * The caller should additionally call `clRetainDevice` on it if it intends
+     * to continue using the device.
+      */
     static Device fromHandle(void* d);
 
     struct Impl;
@@ -826,11 +830,13 @@ class CV_EXPORTS_W OpenCLExecutionContext
     OpenCLExecutionContext cloneWithNewQueue() const;
 
     /** @brief Creates OpenCL execution context
-     * OpenCV will check if available OpenCL platform has platformName name, then assign context to
-     * OpenCV and call `clRetainContext` function. The deviceID device will be used as target device and
-     * new command queue will be created.
+     * OpenCV will check if available OpenCL platform has platformName name,
+     * then assign context to OpenCV.
+     * The deviceID device will be used as target device and a new command queue will be created.
      *
-     * @note Lifetime of passed handles is transferred to OpenCV wrappers on success
+     * @note On success, ownership of one reference of the context and device is taken.
+     * The caller should additionally call `clRetainContext` and/or `clRetainDevice`
+     * to increase the reference count if it wishes to continue using them.
      *
      * @param platformName name of OpenCL platform to attach, this string is used to check if platform is available to OpenCV at runtime
      * @param platformID ID of platform attached context was created for (cl_platform_id)
diff --git a/modules/core/include/opencv2/core/opencl/opencl_info.hpp b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
index 5e5c846ad059..3ead76e5c46e 100644
--- a/modules/core/include/opencv2/core/opencl/opencl_info.hpp
+++ b/modules/core/include/opencv2/core/opencl/opencl_info.hpp
@@ -144,6 +144,10 @@ static void dumpOpenCLInformation()
         DUMP_MESSAGE_STDOUT("    Double support = " << doubleSupportStr);
         DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0);
 
+        const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No";
+        DUMP_MESSAGE_STDOUT("    Half support = " << halfSupportStr);
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0);
+
         const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No";
         DUMP_MESSAGE_STDOUT("    Host unified memory = " << isUnifiedMemoryStr);
         DUMP_CONFIG_PROPERTY("cv_ocl_current_hostUnifiedMemory", device.hostUnifiedMemory());
@@ -191,6 +195,9 @@ static void dumpOpenCLInformation()
 
         DUMP_MESSAGE_STDOUT("    Preferred vector width double = " << device.preferredVectorWidthDouble());
         DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthDouble", device.preferredVectorWidthDouble());
+
+        DUMP_MESSAGE_STDOUT("    Preferred vector width half = " << device.preferredVectorWidthHalf());
+        DUMP_CONFIG_PROPERTY("cv_ocl_current_preferredVectorWidthHalf", device.preferredVectorWidthHalf());
     }
     catch (...)
     {
diff --git a/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp b/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
index 264def5f508b..04b0c4c6cb59 100644
--- a/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
+++ b/modules/core/include/opencv2/core/parallel/backend/parallel_for.tbb.hpp
@@ -38,7 +38,7 @@ static tbb::task_scheduler_init& getScheduler()
 }
 #endif
 
-/** OpenMP parallel_for API implementation
+/** TBB parallel_for API implementation
  *
  * @sa setParallelForBackend
  * @ingroup core_parallel_backend
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index f0368027aa6a..108c0d93e749 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -714,9 +714,27 @@ void Mat::forEach_impl(const Functor& operation) {
 /////////////////////////// Synchronization Primitives ///////////////////////////////
 
 #if !defined(_M_CEE)
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
 typedef std::recursive_mutex Mutex;
 typedef std::lock_guard<cv::Mutex> AutoLock;
-#endif
+#else // OPENCV_DISABLE_THREAD_SUPPORT
+// Custom (failing) implementation of `std::recursive_mutex`.
+struct Mutex {
+    void lock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+    void unlock(){
+        CV_Error(cv::Error::StsNotImplemented,
+                 "cv::Mutex is disabled by OPENCV_DISABLE_THREAD_SUPPORT=ON");
+    }
+};
+// Stub for cv::AutoLock when threads are disabled.
+struct AutoLock {
+    AutoLock(Mutex &) { }
+};
+#endif // OPENCV_DISABLE_THREAD_SUPPORT
+#endif // !defined(_M_CEE)
 
 
 /** @brief Designed for command line parsing
diff --git a/modules/core/include/opencv2/core/utils/filesystem.private.hpp b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
index ea2591c9de1d..72b2bb947968 100644
--- a/modules/core/include/opencv2/core/utils/filesystem.private.hpp
+++ b/modules/core/include/opencv2/core/utils/filesystem.private.hpp
@@ -16,8 +16,8 @@
 #      define OPENCV_HAVE_FILESYSTEM_SUPPORT 1
 #  elif defined(__APPLE__)
 #    include <TargetConditionals.h>
-#    if (defined(TARGET_OS_OSX) && TARGET_OS_OSX) || (!defined(TARGET_OS_OSX) && !TARGET_OS_IPHONE)
-#      define OPENCV_HAVE_FILESYSTEM_SUPPORT 1 // OSX only
+#    if (defined(TARGET_OS_OSX) && TARGET_OS_OSX) || (defined(TARGET_OS_IOS) && TARGET_OS_IOS)
+#      define OPENCV_HAVE_FILESYSTEM_SUPPORT 1 // OSX, iOS only
 #    endif
 #  else
      /* unknown */
diff --git a/modules/core/include/opencv2/core/version.hpp b/modules/core/include/opencv2/core/version.hpp
index 4757e30f9b30..f627d7147265 100644
--- a/modules/core/include/opencv2/core/version.hpp
+++ b/modules/core/include/opencv2/core/version.hpp
@@ -8,7 +8,7 @@
 #define CV_VERSION_MAJOR    4
 #define CV_VERSION_MINOR    5
 #define CV_VERSION_REVISION 3
-#define CV_VERSION_STATUS   "-pre"
+#define CV_VERSION_STATUS   "-dev"
 
 #define CVAUX_STR_EXP(__A)  #__A
 #define CVAUX_STR(__A)      CVAUX_STR_EXP(__A)
diff --git a/modules/core/misc/java/src/java/core+MatAt.kt b/modules/core/misc/java/src/java/core+MatAt.kt
index f48a3deaedf9..c81e21057f27 100644
--- a/modules/core/misc/java/src/java/core+MatAt.kt
+++ b/modules/core/misc/java/src/java/core+MatAt.kt
@@ -3,6 +3,16 @@ package org.opencv.core
 import org.opencv.core.Mat.*
 import java.lang.RuntimeException
 
+fun Mat.get(row: Int, col: Int, data: UByteArray)  = this.get(row, col, data.asByteArray())
+fun Mat.get(indices: IntArray, data: UByteArray)  = this.get(indices, data.asByteArray())
+fun Mat.put(row: Int, col: Int, data: UByteArray)  = this.put(row, col, data.asByteArray())
+fun Mat.put(indices: IntArray, data: UByteArray)  = this.put(indices, data.asByteArray())
+
+fun Mat.get(row: Int, col: Int, data: UShortArray)  = this.get(row, col, data.asShortArray())
+fun Mat.get(indices: IntArray, data: UShortArray)  = this.get(indices, data.asShortArray())
+fun Mat.put(row: Int, col: Int, data: UShortArray)  = this.put(row, col, data.asShortArray())
+fun Mat.put(indices: IntArray, data: UShortArray)  = this.put(indices, data.asShortArray())
+
 /***
  *  Example use:
  *
@@ -19,6 +29,7 @@ inline fun <reified T> Mat.at(row: Int, col: Int) : Atable<T> =
             col
         )
         UByte::class -> AtableUByte(this, row, col) as Atable<T>
+        UShort::class -> AtableUShort(this, row, col) as Atable<T>
         else -> throw RuntimeException("Unsupported class type")
     }
 
@@ -30,6 +41,7 @@ inline fun <reified T> Mat.at(idx: IntArray) : Atable<T> =
             idx
         )
         UByte::class -> AtableUByte(this, idx) as Atable<T>
+        UShort::class -> AtableUShort(this, idx) as Atable<T>
         else -> throw RuntimeException("Unsupported class type")
     }
 
@@ -38,46 +50,95 @@ class AtableUByte(val mat: Mat, val indices: IntArray): Atable<UByte> {
     constructor(mat: Mat, row: Int, col: Int) : this(mat, intArrayOf(row, col))
 
     override fun getV(): UByte {
-        val data = ByteArray(1)
-        mat[indices, data]
-        return data[0].toUByte()
+        val data = UByteArray(1)
+        mat.get(indices, data)
+        return data[0]
     }
 
     override fun setV(v: UByte) {
-        val data = byteArrayOf(v.toByte())
+        val data = ubyteArrayOf(v)
         mat.put(indices, data)
     }
 
     override fun getV2c(): Tuple2<UByte> {
-        val data = ByteArray(2)
-        mat[indices, data]
-        return Tuple2(data[0].toUByte(), data[1].toUByte())
+        val data = UByteArray(2)
+        mat.get(indices, data)
+        return Tuple2(data[0], data[1])
     }
 
     override fun setV2c(v: Tuple2<UByte>) {
-        val data = byteArrayOf(v._0.toByte(), v._1.toByte())
+        val data = ubyteArrayOf(v._0, v._1)
         mat.put(indices, data)
     }
 
     override fun getV3c(): Tuple3<UByte> {
-        val data = ByteArray(3)
-        mat[indices, data]
-        return Tuple3(data[0].toUByte(), data[1].toUByte(), data[2].toUByte())
+        val data = UByteArray(3)
+        mat.get(indices, data)
+        return Tuple3(data[0], data[1], data[2])
     }
 
     override fun setV3c(v: Tuple3<UByte>) {
-        val data = byteArrayOf(v._0.toByte(), v._1.toByte(), v._2.toByte())
+        val data = ubyteArrayOf(v._0, v._1, v._2)
         mat.put(indices, data)
     }
 
     override fun getV4c(): Tuple4<UByte> {
-        val data = ByteArray(4)
-        mat[indices, data]
-        return Tuple4(data[0].toUByte(), data[1].toUByte(), data[2].toUByte(), data[3].toUByte())
+        val data = UByteArray(4)
+        mat.get(indices, data)
+        return Tuple4(data[0], data[1], data[2], data[3])
     }
 
     override fun setV4c(v: Tuple4<UByte>) {
-        val data = byteArrayOf(v._0.toByte(), v._1.toByte(), v._2.toByte(), v._3.toByte())
+        val data = ubyteArrayOf(v._0, v._1, v._2, v._3)
+        mat.put(indices, data)
+    }
+}
+
+class AtableUShort(val mat: Mat, val indices: IntArray): Atable<UShort> {
+
+    constructor(mat: Mat, row: Int, col: Int) : this(mat, intArrayOf(row, col))
+
+    override fun getV(): UShort {
+        val data = UShortArray(1)
+        mat.get(indices, data)
+        return data[0]
+    }
+
+    override fun setV(v: UShort) {
+        val data = ushortArrayOf(v)
+        mat.put(indices, data)
+    }
+
+    override fun getV2c(): Tuple2<UShort> {
+        val data = UShortArray(2)
+        mat.get(indices, data)
+        return Tuple2(data[0], data[1])
+    }
+
+    override fun setV2c(v: Tuple2<UShort>) {
+        val data = ushortArrayOf(v._0, v._1)
+        mat.put(indices, data)
+    }
+
+    override fun getV3c(): Tuple3<UShort> {
+        val data = UShortArray(3)
+        mat.get(indices, data)
+        return Tuple3(data[0], data[1], data[2])
+    }
+
+    override fun setV3c(v: Tuple3<UShort>) {
+        val data = ushortArrayOf(v._0, v._1, v._2)
+        mat.put(indices, data)
+    }
+
+    override fun getV4c(): Tuple4<UShort> {
+        val data = UShortArray(4)
+        mat.get(indices, data)
+        return Tuple4(data[0], data[1], data[2], data[3])
+    }
+
+    override fun setV4c(v: Tuple4<UShort>) {
+        val data = ushortArrayOf(v._0, v._1, v._2, v._3)
         mat.put(indices, data)
     }
 }
diff --git a/modules/core/misc/objc/common/Mat.mm b/modules/core/misc/objc/common/Mat.mm
index 5d41a3622e71..045bd8393ea3 100644
--- a/modules/core/misc/objc/common/Mat.mm
+++ b/modules/core/misc/objc/common/Mat.mm
@@ -548,7 +548,7 @@ - (void)put:(uchar*)dest data:(NSArray<NSNumber*>*)data offset:(int)offset count
     if (depth == CV_8U) {
         putData(dest, count, ^uchar (int index) { return cv::saturate_cast<uchar>(data[offset + index].doubleValue);} );
     } else if (depth == CV_8S) {
-        putData(dest, count, ^char (int index) { return cv::saturate_cast<char>(data[offset + index].doubleValue);} );
+        putData(dest, count, ^schar (int index) { return cv::saturate_cast<schar>(data[offset + index].doubleValue);} );
     } else if (depth == CV_16U) {
         putData(dest, count, ^ushort (int index) { return cv::saturate_cast<ushort>(data[offset + index].doubleValue);} );
     } else if (depth == CV_16S) {
diff --git a/modules/core/misc/objc/common/MatExt.swift b/modules/core/misc/objc/common/MatExt.swift
index 5ce3a5e6fb56..a6ba548599d8 100644
--- a/modules/core/misc/objc/common/MatExt.swift
+++ b/modules/core/misc/objc/common/MatExt.swift
@@ -62,6 +62,21 @@ public extension Mat {
         }
     }
 
+    @discardableResult func get(indices:[Int32], data:inout [UInt8]) throws -> Int32 {
+        let channels = CvType.channels(Int32(type()))
+        if Int32(data.count) % channels != 0 {
+            try throwIncompatibleBufferSize(count: data.count, channels: channels)
+        } else if depth() != CvType.CV_8U {
+            try throwIncompatibleDataType(typeName: CvType.type(toString: type()))
+        }
+        let count = Int32(data.count)
+        return data.withUnsafeMutableBufferPointer { body in
+            body.withMemoryRebound(to: Int8.self) { reboundBody in
+                return __get(indices as [NSNumber], count: count, byteBuffer: reboundBody.baseAddress!)
+            }
+        }
+    }
+
     @discardableResult func get(indices:[Int32], data:inout [Double]) throws -> Int32 {
         let channels = CvType.channels(Int32(type()))
         if Int32(data.count) % channels != 0 {
@@ -114,10 +129,29 @@ public extension Mat {
         }
     }
 
+    @discardableResult func get(indices:[Int32], data:inout [UInt16]) throws -> Int32 {
+        let channels = CvType.channels(Int32(type()))
+        if Int32(data.count) % channels != 0 {
+            try throwIncompatibleBufferSize(count: data.count, channels: channels)
+        } else if depth() != CvType.CV_16U {
+            try throwIncompatibleDataType(typeName: CvType.type(toString: type()))
+        }
+        let count = Int32(data.count)
+        return data.withUnsafeMutableBufferPointer { body in
+            body.withMemoryRebound(to: Int16.self) { reboundBody in
+                return __get(indices as [NSNumber], count: count, shortBuffer: reboundBody.baseAddress!)
+            }
+        }
+    }
+
     @discardableResult func get(row: Int32, col: Int32, data:inout [Int8]) throws -> Int32 {
         return try get(indices: [row, col], data: &data)
     }
 
+    @discardableResult func get(row: Int32, col: Int32, data:inout [UInt8]) throws -> Int32 {
+        return try get(indices: [row, col], data: &data)
+    }
+
     @discardableResult func get(row: Int32, col: Int32, data:inout [Double]) throws -> Int32 {
         return try get(indices: [row, col], data: &data)
     }
@@ -134,6 +168,10 @@ public extension Mat {
         return try get(indices: [row, col], data: &data)
     }
 
+    @discardableResult func get(row: Int32, col: Int32, data:inout [UInt16]) throws -> Int32 {
+        return try get(indices: [row, col], data: &data)
+    }
+
     @discardableResult func put(indices:[Int32], data:[Int8]) throws -> Int32 {
         let channels = CvType.channels(Int32(type()))
         if Int32(data.count) % channels != 0 {
@@ -147,6 +185,21 @@ public extension Mat {
         }
     }
 
+    @discardableResult func put(indices:[Int32], data:[UInt8]) throws -> Int32 {
+        let channels = CvType.channels(Int32(type()))
+        if Int32(data.count) % channels != 0 {
+            try throwIncompatibleBufferSize(count: data.count, channels: channels)
+        } else if depth() != CvType.CV_8U {
+            try throwIncompatibleDataType(typeName: CvType.type(toString: type()))
+        }
+        let count = Int32(data.count)
+        return data.withUnsafeBufferPointer { body in
+            body.withMemoryRebound(to: Int8.self) { reboundBody in
+                return __put(indices as [NSNumber], count: count, byteBuffer: reboundBody.baseAddress!)
+            }
+        }
+    }
+
     @discardableResult func put(indices:[Int32], data:[Int8], offset: Int, length: Int32) throws -> Int32 {
         let channels = CvType.channels(Int32(type()))
         if Int32(data.count) % channels != 0 {
@@ -214,10 +267,29 @@ public extension Mat {
         }
     }
 
+    @discardableResult func put(indices:[Int32], data:[UInt16]) throws -> Int32 {
+        let channels = CvType.channels(Int32(type()))
+        if Int32(data.count) % channels != 0 {
+            try throwIncompatibleBufferSize(count: data.count, channels: channels)
+        } else if depth() != CvType.CV_16U {
+            try throwIncompatibleDataType(typeName: CvType.type(toString: type()))
+        }
+        let count = Int32(data.count)
+        return data.withUnsafeBufferPointer { body in
+            body.withMemoryRebound(to: Int16.self) { reboundBody in
+                return __put(indices as [NSNumber], count: count, shortBuffer: reboundBody.baseAddress!)
+            }
+        }
+    }
+
     @discardableResult func put(row: Int32, col: Int32, data:[Int8]) throws -> Int32 {
         return try put(indices: [row, col], data: data)
     }
 
+    @discardableResult func put(row: Int32, col: Int32, data:[UInt8]) throws -> Int32 {
+        return try put(indices: [row, col], data: data)
+    }
+
     @discardableResult func put(row: Int32, col: Int32, data: [Int8], offset: Int, length: Int32) throws -> Int32 {
         return try put(indices: [row, col], data: data, offset: offset, length: length)
     }
@@ -238,6 +310,10 @@ public extension Mat {
         return try put(indices: [row, col], data: data)
     }
 
+    @discardableResult func put(row: Int32, col: Int32, data: [UInt16]) throws -> Int32 {
+        return try put(indices: [row, col], data: data)
+    }
+
     @discardableResult func get(row: Int32, col: Int32) -> [Double] {
         return get(indices: [row, col])
     }
@@ -303,46 +379,46 @@ public class MatAt<N: Atable> {
 
 extension UInt8: Atable {
     public static func getAt(m: Mat, indices:[Int32]) -> UInt8 {
-        var tmp = [Int8](repeating: 0, count: 1)
+        var tmp = [UInt8](repeating: 0, count: 1)
         try! m.get(indices: indices, data: &tmp)
-        return UInt8(bitPattern: tmp[0])
+        return tmp[0]
     }
 
     public static func putAt(m: Mat, indices: [Int32], v: UInt8) {
-        let tmp = [Int8(bitPattern: v)]
+        let tmp = [v]
         try! m.put(indices: indices, data: tmp)
     }
 
     public static func getAt2c(m: Mat, indices:[Int32]) -> (UInt8, UInt8) {
-        var tmp = [Int8](repeating: 0, count: 2)
+        var tmp = [UInt8](repeating: 0, count: 2)
         try! m.get(indices: indices, data: &tmp)
-        return (UInt8(bitPattern: tmp[0]), UInt8(bitPattern: tmp[1]))
+        return (tmp[0], tmp[1])
     }
 
     public static func putAt2c(m: Mat, indices: [Int32], v: (UInt8, UInt8)) {
-        let tmp = [Int8(bitPattern: v.0), Int8(bitPattern: v.1)]
+        let tmp = [v.0, v.1]
         try! m.put(indices: indices, data: tmp)
     }
 
     public static func getAt3c(m: Mat, indices:[Int32]) -> (UInt8, UInt8, UInt8) {
-        var tmp = [Int8](repeating: 0, count: 3)
+        var tmp = [UInt8](repeating: 0, count: 3)
         try! m.get(indices: indices, data: &tmp)
-        return (UInt8(bitPattern: tmp[0]), UInt8(bitPattern: tmp[1]), UInt8(bitPattern: tmp[2]))
+        return (tmp[0], tmp[1], tmp[2])
     }
 
     public static func putAt3c(m: Mat, indices: [Int32], v: (UInt8, UInt8, UInt8)) {
-        let tmp = [Int8(bitPattern: v.0), Int8(bitPattern: v.1), Int8(bitPattern: v.2)]
+        let tmp = [v.0, v.1, v.2]
         try! m.put(indices: indices, data: tmp)
     }
 
     public static func getAt4c(m: Mat, indices:[Int32]) -> (UInt8, UInt8, UInt8, UInt8) {
-        var tmp = [Int8](repeating: 0, count: 4)
+        var tmp = [UInt8](repeating: 0, count: 4)
         try! m.get(indices: indices, data: &tmp)
-        return (UInt8(bitPattern: tmp[0]), UInt8(bitPattern: tmp[1]), UInt8(bitPattern: tmp[2]), UInt8(bitPattern: tmp[3]))
+        return (tmp[0], tmp[1], tmp[2], tmp[3])
     }
 
     public static func putAt4c(m: Mat, indices: [Int32], v: (UInt8, UInt8, UInt8, UInt8)) {
-        let tmp = [Int8(bitPattern: v.0), Int8(bitPattern: v.1), Int8(bitPattern: v.2), Int8(bitPattern: v.3)]
+        let tmp = [v.0, v.1, v.2, v.3]
         try! m.put(indices: indices, data: tmp)
     }
 }
@@ -531,6 +607,52 @@ extension Int32: Atable {
     }
 }
 
+extension UInt16: Atable {
+    public static func getAt(m: Mat, indices:[Int32]) -> UInt16 {
+        var tmp = [UInt16](repeating: 0, count: 1)
+        try! m.get(indices: indices, data: &tmp)
+        return tmp[0]
+    }
+
+    public static func putAt(m: Mat, indices: [Int32], v: UInt16) {
+        let tmp = [v]
+        try! m.put(indices: indices, data: tmp)
+    }
+
+    public static func getAt2c(m: Mat, indices:[Int32]) -> (UInt16, UInt16) {
+        var tmp = [UInt16](repeating: 0, count: 2)
+        try! m.get(indices: indices, data: &tmp)
+        return (tmp[0], tmp[1])
+    }
+
+    public static func putAt2c(m: Mat, indices: [Int32], v: (UInt16, UInt16)) {
+        let tmp = [v.0, v.1]
+        try! m.put(indices: indices, data: tmp)
+    }
+
+    public static func getAt3c(m: Mat, indices:[Int32]) -> (UInt16, UInt16, UInt16) {
+        var tmp = [UInt16](repeating: 0, count: 3)
+        try! m.get(indices: indices, data: &tmp)
+        return (tmp[0], tmp[1], tmp[2])
+    }
+
+    public static func putAt3c(m: Mat, indices: [Int32], v: (UInt16, UInt16, UInt16)) {
+        let tmp = [v.0, v.1, v.2]
+        try! m.put(indices: indices, data: tmp)
+    }
+
+    public static func getAt4c(m: Mat, indices:[Int32]) -> (UInt16, UInt16, UInt16, UInt16) {
+        var tmp = [UInt16](repeating: 0, count: 4)
+        try! m.get(indices: indices, data: &tmp)
+        return (tmp[0], tmp[1], tmp[2], tmp[3])
+    }
+
+    public static func putAt4c(m: Mat, indices: [Int32], v: (UInt16, UInt16, UInt16, UInt16)) {
+        let tmp = [v.0, v.1, v.2, v.3]
+        try! m.put(indices: indices, data: tmp)
+    }
+}
+
 extension Int16: Atable {
     public static func getAt(m: Mat, indices:[Int32]) -> Int16 {
         var tmp = [Int16](repeating: 0, count: 1)
diff --git a/modules/core/misc/objc/test/MatTest.swift b/modules/core/misc/objc/test/MatTest.swift
index 14c440b5eb88..8a513505cc14 100644
--- a/modules/core/misc/objc/test/MatTest.swift
+++ b/modules/core/misc/objc/test/MatTest.swift
@@ -308,15 +308,15 @@ class MatTests: OpenCVTestCase {
         XCTAssert([340] == sm.get(row: 1, col: 1))
     }
 
-    func testGetIntIntByteArray() throws {
-        let m = try getTestMat(size: 5, type: CvType.CV_8UC3)
+    func testGetIntIntInt8Array() throws {
+        let m = try getTestMat(size: 5, type: CvType.CV_8SC3)
         var goodData = [Int8](repeating: 0, count: 9)
 
         // whole Mat
         var bytesNum = try m.get(row: 1, col: 1, data: &goodData)
 
         XCTAssertEqual(9, bytesNum)
-        XCTAssert([110, 111, 112, 120, 121, 122, -126, -125, -124] == goodData)
+        XCTAssert([110, 111, 112, 120, 121, 122, 127, 127, 127] == goodData)
 
         var badData = [Int8](repeating: 0, count: 7)
         XCTAssertThrowsError(bytesNum = try m.get(row: 0, col: 0, data: &badData))
@@ -326,11 +326,36 @@ class MatTests: OpenCVTestCase {
         var buff00 = [Int8](repeating: 0, count: 3)
         bytesNum = try sm.get(row: 0, col: 0, data: &buff00)
         XCTAssertEqual(3, bytesNum)
-        XCTAssert(buff00 == [-26, -25, -24])
+        XCTAssert(buff00 == [127, 127, 127])
         var buff11 = [Int8](repeating: 0, count: 3)
         bytesNum = try sm.get(row: 1, col: 1, data: &buff11)
         XCTAssertEqual(3, bytesNum)
-        XCTAssert(buff11 == [-1, -1, -1])
+        XCTAssert(buff11 == [127, 127, 127])
+    }
+
+    func testGetIntIntUInt8Array() throws {
+        let m = try getTestMat(size: 5, type: CvType.CV_8UC3)
+        var goodData = [UInt8](repeating: 0, count: 9)
+
+        // whole Mat
+        var bytesNum = try m.get(row: 1, col: 1, data: &goodData)
+
+        XCTAssertEqual(9, bytesNum)
+        XCTAssert([110, 111, 112, 120, 121, 122, 130, 131, 132] == goodData)
+
+        var badData = [UInt8](repeating: 0, count: 7)
+        XCTAssertThrowsError(bytesNum = try m.get(row: 0, col: 0, data: &badData))
+
+        // sub-Mat
+        let sm = m.submat(rowStart: 2, rowEnd: 4, colStart: 3, colEnd: 5)
+        var buff00 = [UInt8](repeating: 0, count: 3)
+        bytesNum = try sm.get(row: 0, col: 0, data: &buff00)
+        XCTAssertEqual(3, bytesNum)
+        XCTAssert(buff00 == [230, 231, 232])
+        var buff11 = [UInt8](repeating: 0, count: 3)
+        bytesNum = try sm.get(row: 1, col: 1, data: &buff11)
+        XCTAssertEqual(3, bytesNum)
+        XCTAssert(buff11 == [255, 255, 255])
     }
 
     func testGetIntIntDoubleArray() throws {
@@ -399,7 +424,7 @@ class MatTests: OpenCVTestCase {
         XCTAssert(buff11 == [340, 341, 0, 0])
     }
 
-    func testGetIntIntShortArray() throws {
+    func testGetIntIntInt16Array() throws {
         let m = try getTestMat(size: 5, type: CvType.CV_16SC2)
         var buff = [Int16](repeating: 0, count: 6)
 
@@ -421,6 +446,28 @@ class MatTests: OpenCVTestCase {
         XCTAssert(buff11 == [340, 341, 0, 0])
     }
 
+    func testGetIntIntUInt16Array() throws {
+        let m = try getTestMat(size: 5, type: CvType.CV_16UC2)
+        var buff = [UInt16](repeating: 0, count: 6)
+
+        // whole Mat
+        var bytesNum = try m.get(row: 1, col: 1, data: &buff)
+
+        XCTAssertEqual(12, bytesNum);
+        XCTAssert(buff == [110, 111, 120, 121, 130, 131])
+
+        // sub-Mat
+        let sm = m.submat(rowStart: 2, rowEnd: 4, colStart: 3, colEnd: 5)
+        var buff00 = [UInt16](repeating: 0, count: 4)
+        bytesNum = try sm.get(row: 0, col: 0, data: &buff00)
+        XCTAssertEqual(8, bytesNum)
+        XCTAssert(buff00 == [230, 231, 240, 241])
+        var buff11 = [UInt16](repeating: 0, count: 4)
+        bytesNum = try sm.get(row: 1, col: 1, data: &buff11)
+        XCTAssertEqual(4, bytesNum);
+        XCTAssert(buff11 == [340, 341, 0, 0])
+    }
+
     func testHeight() {
         XCTAssertEqual(gray0.rows(), gray0.height())
         XCTAssertEqual(rgbLena.rows(), rgbLena.height())
@@ -653,7 +700,7 @@ class MatTests: OpenCVTestCase {
         try assertMatEqual(truth!, m1, OpenCVTestCase.EPS)
     }
 
-    func testPutIntIntByteArray() throws {
+    func testPutIntIntInt8Array() throws {
         let m = Mat(rows: 5, cols: 5, type: CvType.CV_8SC3, scalar: Scalar(1, 2, 3))
         let sm = m.submat(rowStart: 2, rowEnd: 4, colStart: 3, colEnd: 5)
         var buff = [Int8](repeating: 0, count: 6)
@@ -683,7 +730,37 @@ class MatTests: OpenCVTestCase {
         XCTAssert(buff == buff0)
     }
 
-    func testPutIntArrayByteArray() throws {
+    func testPutIntIntUInt8Array() throws {
+        let m = Mat(rows: 5, cols: 5, type: CvType.CV_8UC3, scalar: Scalar(1, 2, 3))
+        let sm = m.submat(rowStart: 2, rowEnd: 4, colStart: 3, colEnd: 5)
+        var buff = [UInt8](repeating: 0, count: 6)
+        let buff0:[UInt8] = [10, 20, 30, 40, 50, 60]
+        let buff1:[UInt8] = [255, 254, 253, 252, 251, 250]
+
+        var bytesNum = try m.put(row:1, col:2, data:buff0)
+
+        XCTAssertEqual(6, bytesNum)
+        bytesNum = try m.get(row: 1, col: 2, data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff0)
+
+        bytesNum = try sm.put(row:0, col:0, data:buff1)
+
+        XCTAssertEqual(6, bytesNum)
+        bytesNum = try sm.get(row: 0, col: 0, data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff1)
+        bytesNum = try m.get(row: 2, col: 3, data: &buff)
+        XCTAssertEqual(6, bytesNum);
+        XCTAssert(buff == buff1)
+
+        let m1 = m.row(1)
+        bytesNum = try m1.get(row: 0, col: 2, data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff0)
+    }
+
+    func testPutIntArrayInt8Array() throws {
         let m = Mat(sizes: [5, 5, 5], type: CvType.CV_8SC3, scalar: Scalar(1, 2, 3))
         let sm = m.submat(ranges: [Range(start: 0, end: 2), Range(start: 1, end: 3), Range(start: 2, end: 4)])
         var buff = [Int8](repeating: 0, count: 6)
@@ -714,10 +791,41 @@ class MatTests: OpenCVTestCase {
         XCTAssert(buff == buff0)
     }
 
+    func testPutIntArrayUInt8Array() throws {
+        let m = Mat(sizes: [5, 5, 5], type: CvType.CV_8UC3, scalar: Scalar(1, 2, 3))
+        let sm = m.submat(ranges: [Range(start: 0, end: 2), Range(start: 1, end: 3), Range(start: 2, end: 4)])
+        var buff = [UInt8](repeating: 0, count: 6)
+        let buff0:[UInt8] = [10, 20, 30, 40, 50, 60]
+        let buff1:[UInt8] = [255, 254, 253, 252, 251, 250]
+
+        var bytesNum = try m.put(indices:[1, 2, 0], data:buff0)
+
+        XCTAssertEqual(6, bytesNum)
+        bytesNum = try m.get(indices: [1, 2, 0], data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff0)
+
+        bytesNum = try sm.put(indices: [0, 0, 0], data: buff1)
+
+        XCTAssertEqual(6, bytesNum)
+        bytesNum = try sm.get(indices: [0, 0, 0], data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff1)
+
+        bytesNum = try m.get(indices: [0, 1, 2], data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff1)
+
+        let m1 = m.submat(ranges: [Range(start: 1,end: 2), Range.all(), Range.all()])
+        bytesNum = try m1.get(indices: [0, 2, 0], data: &buff)
+        XCTAssertEqual(6, bytesNum)
+        XCTAssert(buff == buff0)
+    }
+
     func testPutIntIntDoubleArray() throws {
-        let m = Mat(rows: 5, cols: 5, type: CvType.CV_8SC3, scalar: Scalar(1, 2, 3))
+        let m = Mat(rows: 5, cols: 5, type: CvType.CV_8UC3, scalar: Scalar(1, 2, 3))
         let sm = m.submat(rowStart: 2, rowEnd: 4, colStart: 3, colEnd: 5)
-        var buff = [Int8](repeating: 0, count: 6)
+        var buff = [UInt8](repeating: 0, count: 6)
 
         var bytesNum = try m.put(row: 1, col: 2, data: [10, 20, 30, 40, 50, 60] as [Double])
 
@@ -731,16 +839,16 @@ class MatTests: OpenCVTestCase {
         XCTAssertEqual(6, bytesNum)
         bytesNum = try sm.get(row: 0, col: 0, data: &buff)
         XCTAssertEqual(6, bytesNum);
-        XCTAssert(buff == [-1, -2, -3, -4, -5, -6])
+        XCTAssert(buff == [255, 254, 253, 252, 251, 250])
         bytesNum = try m.get(row: 2, col: 3, data: &buff)
         XCTAssertEqual(6, bytesNum);
-        XCTAssert(buff == [-1, -2, -3, -4, -5, -6])
+        XCTAssert(buff == [255, 254, 253, 252, 251, 250])
     }
 
     func testPutIntArrayDoubleArray() throws {
-        let m = Mat(sizes: [5, 5, 5], type: CvType.CV_8SC3, scalar: Scalar(1, 2, 3))
+        let m = Mat(sizes: [5, 5, 5], type: CvType.CV_8UC3, scalar: Scalar(1, 2, 3))
         let sm = m.submat(ranges: [Range(start: 0, end: 2), Range(start: 1, end: 3), Range(start: 2, end: 4)])
-        var buff = [Int8](repeating: 0, count: 6)
+        var buff = [UInt8](repeating: 0, count: 6)
 
         var bytesNum = try m.put(indices: [1, 2, 0], data: [10, 20, 30, 40, 50, 60] as [Double])
 
@@ -754,10 +862,10 @@ class MatTests: OpenCVTestCase {
         XCTAssertEqual(6, bytesNum);
         bytesNum = try sm.get(indices: [0, 0, 0], data: &buff)
         XCTAssertEqual(6, bytesNum);
-        XCTAssert(buff == [-1, -2, -3, -4, -5, -6])
+        XCTAssert(buff == [255, 254, 253, 252, 251, 250])
         bytesNum = try m.get(indices: [0, 1, 2], data: &buff)
         XCTAssertEqual(6, bytesNum)
-        XCTAssert(buff == [-1, -2, -3, -4, -5, -6])
+        XCTAssert(buff == [255, 254, 253, 252, 251, 250])
     }
 
     func testPutIntIntFloatArray() throws {
@@ -820,7 +928,7 @@ class MatTests: OpenCVTestCase {
         XCTAssert([40, 50, 60] == m.get(indices: [0, 1, 0]))
     }
 
-    func testPutIntIntShortArray() throws {
+    func testPutIntIntInt16Array() throws {
         let m = Mat(rows: 5, cols: 5, type: CvType.CV_16SC3, scalar: Scalar(-1, -2, -3))
         let elements: [Int16] = [ 10, 20, 30, 40, 50, 60]
 
@@ -834,7 +942,21 @@ class MatTests: OpenCVTestCase {
         XCTAssert([40, 50, 60] == m.get(row: 2, col: 4))
     }
 
-    func testPutIntArrayShortArray() throws {
+    func testPutIntIntUInt16Array() throws {
+        let m = Mat(rows: 5, cols: 5, type: CvType.CV_16UC3, scalar: Scalar(-1, -2, -3))
+        let elements: [UInt16] = [ 10, 20, 30, 40, 50, 60]
+
+        var bytesNum = try m.put(row: 2, col: 3, data: elements)
+
+        XCTAssertEqual(Int32(elements.count * 2), bytesNum)
+        let m1 = m.col(3)
+        var buff = [UInt16](repeating: 0, count: 3)
+        bytesNum = try m1.get(row: 2, col: 0, data: &buff)
+        XCTAssert(buff == [10, 20, 30])
+        XCTAssert([40, 50, 60] == m.get(row: 2, col: 4))
+    }
+
+    func testPutIntArrayInt16Array() throws {
         let m = Mat(sizes: [5, 5, 5], type: CvType.CV_16SC3, scalar: Scalar(-1, -2, -3))
         let elements: [Int16] = [ 10, 20, 30, 40, 50, 60]
 
@@ -848,6 +970,20 @@ class MatTests: OpenCVTestCase {
         XCTAssert([40, 50, 60] == m.get(indices: [0, 2, 4]))
     }
 
+    func testPutIntArrayUInt16Array() throws {
+        let m = Mat(sizes: [5, 5, 5], type: CvType.CV_16UC3, scalar: Scalar(-1, -2, -3))
+        let elements: [UInt16] = [ 10, 20, 30, 40, 50, 60]
+
+        var bytesNum = try m.put(indices: [0, 2, 3], data: elements)
+
+        XCTAssertEqual(Int32(elements.count * 2), bytesNum)
+        let m1 = m.submat(ranges: [Range.all(), Range.all(), Range(start: 3, end: 4)])
+        var buff = [UInt16](repeating: 0, count: 3)
+        bytesNum = try m1.get(indices: [0, 2, 0], data: &buff)
+        XCTAssert(buff == [10, 20, 30])
+        XCTAssert([40, 50, 60] == m.get(indices: [0, 2, 4]))
+    }
+
     func testReshapeInt() throws {
         let src = Mat(rows: 4, cols: 4, type: CvType.CV_8U, scalar: Scalar(0))
         dst = src.reshape(channels: 4)
diff --git a/modules/core/src/async.cpp b/modules/core/src/async.cpp
index a2f4612365b9..78c0a1ee8116 100644
--- a/modules/core/src/async.cpp
+++ b/modules/core/src/async.cpp
@@ -14,6 +14,7 @@
 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
 #include <opencv2/core/utils/logger.hpp>
 
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
 
 #ifdef CV_CXX11
 #include <mutex>
@@ -236,6 +237,171 @@ struct AsyncArray::Impl
     }
 };
 
+}  // namespace
+
+#else  // OPENCV_DISABLE_THREAD_SUPPORT
+
+namespace cv {
+
+// no threading
+struct AsyncArray::Impl
+{
+    int refcount;
+    void addrefFuture() CV_NOEXCEPT { refcount_future++; refcount++; }
+    void releaseFuture() CV_NOEXCEPT { refcount_future--; if (0 == --refcount) delete this; }
+    int refcount_future;
+    void addrefPromise() CV_NOEXCEPT { refcount_promise++; refcount++; } \
+    void releasePromise() CV_NOEXCEPT { refcount_promise--; if (0 == --refcount) delete this; }
+    int refcount_promise;
+
+    mutable bool has_result; // Mat, UMat or exception
+
+    mutable cv::Ptr<Mat> result_mat;
+    mutable cv::Ptr<UMat> result_umat;
+
+
+    bool has_exception;
+#if CV__EXCEPTION_PTR
+    std::exception_ptr exception;
+#endif
+    cv::Exception cv_exception;
+
+    mutable bool result_is_fetched;
+
+    bool future_is_returned;
+
+    Impl()
+        : refcount(1), refcount_future(0), refcount_promise(1)
+        , has_result(false)
+        , has_exception(false)
+        , result_is_fetched(false)
+        , future_is_returned(false)
+    {
+        // nothing
+    }
+
+    ~Impl()
+    {
+        if (has_result && !result_is_fetched)
+        {
+            CV_LOG_INFO(NULL, "Asynchronous result has not been fetched");
+        }
+    }
+
+    bool get(OutputArray dst, int64 timeoutNs) const
+    {
+        CV_Assert(!result_is_fetched);
+        if (!has_result)
+        {
+            CV_UNUSED(timeoutNs);
+            CV_Error(Error::StsError, "Result is not produced (unable to wait for result in OPENCV_DISABLE_THREAD_SUPPORT mode)");
+        }
+        if (!result_mat.empty())
+        {
+            dst.move(*result_mat.get());
+            result_mat.release();
+            result_is_fetched = true;
+            return true;
+        }
+        if (!result_umat.empty())
+        {
+            dst.move(*result_umat.get());
+            result_umat.release();
+            result_is_fetched = true;
+            return true;
+        }
+#if CV__EXCEPTION_PTR
+        if (has_exception && exception)
+        {
+            result_is_fetched = true;
+            std::rethrow_exception(exception);
+        }
+#endif
+        if (has_exception)
+        {
+            result_is_fetched = true;
+            throw cv_exception;
+        }
+        CV_Error(Error::StsInternal, "AsyncArray: invalid state of 'has_result = true'");
+        return false;
+    }
+
+    bool valid() const CV_NOEXCEPT
+    {
+        if (result_is_fetched)
+            return false;
+        if (refcount_promise == 0 && !has_result)
+            return false;
+        return true;
+    }
+
+    bool wait_for(int64 timeoutNs) const
+    {
+        CV_Assert(valid());
+        if (has_result)
+            return has_result;
+        if (timeoutNs == 0)
+            return has_result;
+        CV_Error(Error::StsError, "Unable to wait in OPENCV_DISABLE_THREAD_SUPPORT mode");
+    }
+
+    AsyncArray getArrayResult()
+    {
+        CV_Assert(refcount_future == 0);
+        AsyncArray result;
+        addrefFuture();
+        result.p = this;
+        future_is_returned = true;
+        return result;
+    }
+
+    void setValue(InputArray value)
+    {
+        if (future_is_returned && refcount_future == 0)
+            CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
+        CV_Assert(!has_result);
+        int k = value.kind();
+        if (k == _InputArray::UMAT)
+        {
+            result_umat = makePtr<UMat>();
+            value.copyTo(*result_umat.get());
+        }
+        else
+        {
+            result_mat = makePtr<Mat>();
+            value.copyTo(*result_mat.get());
+        }
+        has_result = true;
+    }
+
+#if CV__EXCEPTION_PTR
+    void setException(std::exception_ptr e)
+    {
+        if (future_is_returned && refcount_future == 0)
+            CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
+        CV_Assert(!has_result);
+        has_exception = true;
+        exception = e;
+        has_result = true;
+    }
+#endif
+
+    void setException(const cv::Exception e)
+    {
+        if (future_is_returned && refcount_future == 0)
+            CV_Error(Error::StsError, "Associated AsyncArray has been destroyed");
+        CV_Assert(!has_result);
+        has_exception = true;
+        cv_exception = e;
+        has_result = true;
+    }
+};
+
+}
+
+#endif  // OPENCV_DISABLE_THREAD_SUPPORT
+
+namespace cv {
 
 AsyncArray::AsyncArray() CV_NOEXCEPT
     : p(NULL)
diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp
index 83c8aaeb5705..227c7aaef774 100644
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -229,14 +229,14 @@ void cv::setIdentity( InputOutputArray _m, const Scalar& s )
 
 namespace cv {
 
-UMat UMat::eye(int rows, int cols, int type)
+UMat UMat::eye(int rows, int cols, int type, UMatUsageFlags usageFlags)
 {
-    return UMat::eye(Size(cols, rows), type);
+    return UMat::eye(Size(cols, rows), type, usageFlags);
 }
 
-UMat UMat::eye(Size size, int type)
+UMat UMat::eye(Size size, int type, UMatUsageFlags usageFlags)
 {
-    UMat m(size, type);
+    UMat m(size, type, usageFlags);
     setIdentity(m);
     return m;
 }
diff --git a/modules/core/src/norm.cpp b/modules/core/src/norm.cpp
index bbefefc95d2f..4df25f495722 100644
--- a/modules/core/src/norm.cpp
+++ b/modules/core/src/norm.cpp
@@ -1194,7 +1194,7 @@ double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask
         // special case to handle "integer" overflow in accumulator
         const size_t esz = src1.elemSize();
         const int total = (int)it.size;
-        const int intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
+        const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
         const int blockSize = std::min(total, intSumBlockSize);
         int isum = 0;
         int count = 0;
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 0e97cf52feb3..46185446f726 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -1566,6 +1566,7 @@ struct Device::Impl
         version_ = getStrProp(CL_DEVICE_VERSION);
         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
+        halfFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG);
         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
@@ -1678,6 +1679,7 @@ struct Device::Impl
     String version_;
     std::string extensions_;
     int doubleFPConfig_;
+    int halfFPConfig_;
     bool hostUnifiedMemory_;
     int maxComputeUnits_;
     size_t maxWorkGroupSize_;
@@ -1827,11 +1829,7 @@ int Device::singleFPConfig() const
 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
 
 int Device::halfFPConfig() const
-#ifdef CL_VERSION_1_2
-{ return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
-#else
-{ CV_REQUIRE_OPENCL_1_2_ERROR; }
-#endif
+{ return p ? p->halfFPConfig_ : 0; }
 
 bool Device::endianLittle() const
 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
@@ -6668,6 +6666,10 @@ void convertFromImage(void* cl_mem_image, UMat& dst)
         depth = CV_32F;
         break;
 
+    case CL_HALF_FLOAT:
+        depth = CV_16F;
+        break;
+
     default:
         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
     }
@@ -6676,9 +6678,23 @@ void convertFromImage(void* cl_mem_image, UMat& dst)
     switch (fmt.image_channel_order)
     {
     case CL_R:
+    case CL_A:
+    case CL_INTENSITY:
+    case CL_LUMINANCE:
         type = CV_MAKE_TYPE(depth, 1);
         break;
 
+    case CL_RG:
+    case CL_RA:
+        type = CV_MAKE_TYPE(depth, 2);
+        break;
+
+    // CL_RGB has no mappings to OpenCV types because CL_RGB can only be used with
+    // CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, or CL_UNORM_INT_101010.
+    /*case CL_RGB:
+        type = CV_MAKE_TYPE(depth, 3);
+        break;*/
+
     case CL_RGBA:
     case CL_BGRA:
     case CL_ARGB:
@@ -7068,6 +7084,13 @@ static std::string kerToStr(const Mat & k)
             stream << "DIG(" << data[i] << "f)";
         stream << "DIG(" << data[width] << "f)";
     }
+    else if (depth == CV_16F)
+    {
+        stream.setf(std::ios_base::showpoint);
+        for (int i = 0; i < width; ++i)
+            stream << "DIG(" << (float)data[i] << "h)";
+        stream << "DIG(" << (float)data[width] << "h)";
+    }
     else
     {
         for (int i = 0; i < width; ++i)
@@ -7091,7 +7114,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
 
     typedef std::string (* func_t)(const Mat &);
     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
-                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
+                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, kerToStr<float16_t> };
     const func_t func = funcs[ddepth];
     CV_Assert(func != 0);
 
@@ -7130,14 +7153,14 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
-        d.preferredVectorWidthDouble(), -1 };
+        d.preferredVectorWidthDouble(), d.preferredVectorWidthHalf() };
 
     // if the device says don't use vectors
     if (vectorWidths[0] == 1)
     {
         // it's heuristic
         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
-        vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
+        vectorWidths[CV_16U] = vectorWidths[CV_16S] = vectorWidths[CV_16F] = 2;
         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
     }
 
@@ -7225,10 +7248,12 @@ struct Image2D::Impl
     {
         cl_image_format format;
         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
-                                       CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
+                                       CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, CL_HALF_FLOAT };
         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
-        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
+        // CL_RGB has no mappings to OpenCV types because CL_RGB can only be used with
+        // CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, or CL_UNORM_INT_101010.
+        static const int channelOrders[] = { -1, CL_R, CL_RG, /*CL_RGB*/ -1, CL_RGBA };
 
         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
         int channelOrder = channelOrders[cn];
diff --git a/modules/core/src/parallel.cpp b/modules/core/src/parallel.cpp
index 7bb7e4633dcd..8fccd19798ae 100644
--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@@ -56,7 +56,7 @@
     #undef abs
 #endif
 
-#if defined __linux__ || defined __APPLE__ || defined __GLIBC__ \
+#if defined __unix__ || defined __APPLE__ || defined __GLIBC__ \
     || defined __HAIKU__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ \
     || defined __OpenBSD__
     #include <unistd.h>
@@ -72,7 +72,7 @@
     #endif
 #endif
 
-#if defined CV_CXX11
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
     #include <thread>
 #endif
 
@@ -884,9 +884,11 @@ T minNonZero(const T& val_1, const T& val_2)
     return (val_1 != 0) ? val_1 : val_2;
 }
 
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
 static
 int getNumberOfCPUs_()
 {
+#ifndef OPENCV_SEMIHOSTING
     /*
      * Logic here is to try different methods of getting CPU counts and return
      * the minimum most value as it has high probablity of being right and safe.
@@ -978,6 +980,9 @@ int getNumberOfCPUs_()
 #endif
 
     return ncpus != 0 ? ncpus : 1;
+#else //  OPENCV_SEMIHOSTING
+    return 1;
+#endif //OPENCV_SEMIHOSTING
 }
 
 int getNumberOfCPUs()
@@ -986,6 +991,13 @@ int getNumberOfCPUs()
     return nCPUs;  // cached value
 }
 
+#else  // OPENCV_DISABLE_THREAD_SUPPORT
+int getNumberOfCPUs()
+{
+    return 1;
+}
+#endif  // OPENCV_DISABLE_THREAD_SUPPORT
+
 const char* currentParallelFramework()
 {
     std::shared_ptr<ParallelForAPI>& api = getCurrentParallelForAPI();
diff --git a/modules/core/src/persistence.cpp b/modules/core/src/persistence.cpp
index 4bf52a3134df..291931b5ae01 100644
--- a/modules/core/src/persistence.cpp
+++ b/modules/core/src/persistence.cpp
@@ -4,6 +4,8 @@
 
 #include "precomp.hpp"
 #include "persistence.hpp"
+#include "persistence_impl.hpp"
+#include "persistence_base64_encoding.hpp"
 #include <unordered_map>
 #include <iterator>
 
@@ -143,17 +145,17 @@ static const char symbols[9] = "ucwsifdh";
 static char typeSymbol(int depth)
 {
     CV_StaticAssert(CV_64F == 6, "");
-    CV_Assert(depth >=0 && depth <= CV_64F);
+    CV_CheckDepth(depth, depth >=0 && depth <= CV_16F, "");
     return symbols[depth];
 }
 
 static int symbolToType(char c)
 {
-    const char* pos = strchr( symbols, c );
-    if( !pos )
-        CV_Error( CV_StsBadArg, "Invalid data type specification" );
     if (c == 'r')
         return CV_SEQ_ELTYPE_PTR;
+    const char* pos = strchr( symbols, c );
+    if( !pos )
+        CV_Error( cv::Error::StsBadArg, "Invalid data type specification" );
     return static_cast<int>(pos - symbols);
 }
 
@@ -192,7 +194,7 @@ int decodeFormat( const char* dt, int* fmt_pairs, int max_len )
             }
 
             if( count <= 0 )
-                CV_Error( CV_StsBadArg, "Invalid data type specification" );
+                CV_Error( cv::Error::StsBadArg, "Invalid data type specification" );
 
             fmt_pairs[i] = count;
         }
@@ -208,7 +210,7 @@ int decodeFormat( const char* dt, int* fmt_pairs, int max_len )
             {
                 i += 2;
                 if( i >= max_len )
-                    CV_Error( CV_StsBadArg, "Too long data type specification" );
+                    CV_Error( cv::Error::StsBadArg, "Too long data type specification" );
             }
             fmt_pairs[i] = 0;
         }
@@ -245,8 +247,12 @@ int calcStructSize( const char* dt, int initial_size )
 {
     int size = calcElemSize( dt, initial_size );
     size_t elem_max_size = 0;
-    for ( const char * type = dt; *type != '\0'; type++ ) {
-        switch ( *type )
+    for ( const char * type = dt; *type != '\0'; type++ )
+    {
+        char v = *type;
+        if (v >= '0' && v <= '9')
+            continue;  // skip vector size
+        switch (v)
         {
         case 'u': { elem_max_size = std::max( elem_max_size, sizeof(uchar ) ); break; }
         case 'c': { elem_max_size = std::max( elem_max_size, sizeof(schar ) ); break; }
@@ -255,7 +261,9 @@ int calcStructSize( const char* dt, int initial_size )
         case 'i': { elem_max_size = std::max( elem_max_size, sizeof(int   ) ); break; }
         case 'f': { elem_max_size = std::max( elem_max_size, sizeof(float ) ); break; }
         case 'd': { elem_max_size = std::max( elem_max_size, sizeof(double) ); break; }
-        default: break;
+        case 'h': { elem_max_size = std::max(elem_max_size, sizeof(float16_t)); break; }
+        default:
+            CV_Error_(Error::StsNotImplemented, ("Unknown type identifier: '%c' in '%s'", (char)(*type), dt));
         }
     }
     size = cvAlign( size, static_cast<int>(elem_max_size) );
@@ -269,7 +277,7 @@ int decodeSimpleFormat( const char* dt )
 
     fmt_pair_count = decodeFormat( dt, fmt_pairs, CV_FS_MAX_FMT_PAIRS );
     if( fmt_pair_count != 1 || fmt_pairs[0] >= CV_CN_MAX)
-        CV_Error( CV_StsError, "Too complex format for the matrix" );
+        CV_Error( cv::Error::StsError, "Too complex format for the matrix" );
 
     elem_type = CV_MAKETYPE( fmt_pairs[1], fmt_pairs[0] );
 
@@ -339,1449 +347,1483 @@ static inline void writeReal(uchar* p, double fval)
 #endif
 }
 
-class FileStorage::Impl : public FileStorage_API
-{
-public:
-    void init()
-    {
-        flags = 0;
-        buffer.clear();
-        bufofs = 0;
-        state = UNDEFINED;
-        is_opened = false;
-        dummy_eof = false;
-        write_mode = false;
-        mem_mode = false;
-        space = 0;
-        wrap_margin = 71;
-        fmt = 0;
-        file = 0;
-        gzfile = 0;
-        empty_stream = true;
-
-        strbufv.clear();
-        strbuf = 0;
-        strbufsize = strbufpos = 0;
-        roots.clear();
-
-        fs_data.clear();
-        fs_data_ptrs.clear();
-        fs_data_blksz.clear();
-        freeSpaceOfs = 0;
-
-        str_hash.clear();
-        str_hash_data.clear();
-        str_hash_data.resize(1);
-        str_hash_data[0] = '\0';
-
-        filename.clear();
-        lineno = 0;
-    }
-
-    Impl(FileStorage* _fs)
-    {
-        fs_ext = _fs;
-        init();
-    }
 
-    virtual ~Impl()
-    {
-        release();
-    }
 
-    void release(String* out=0)
-    {
-        if( is_opened )
-        {
-            if(out)
-                out->clear();
-            if( write_mode )
-            {
-                while( write_stack.size() > 1 )
-                {
-                    endWriteStruct();
-                }
-                flush();
-                if( fmt == FileStorage::FORMAT_XML )
-                    puts( "</opencv_storage>\n" );
-                else if ( fmt == FileStorage::FORMAT_JSON )
-                    puts( "}\n" );
-            }
-            if( mem_mode && out )
-            {
-                *out = cv::String(outbuf.begin(), outbuf.end());
+void FileStorage::Impl::init() {
+    flags = 0;
+    buffer.clear();
+    bufofs = 0;
+    state = UNDEFINED;
+    is_using_base64 = false;
+    state_of_writing_base64 = FileStorage_API::Base64State::Uncertain;
+    is_write_struct_delayed = false;
+    delayed_struct_key = nullptr;
+    delayed_struct_flags = 0;
+    delayed_type_name = nullptr;
+    base64_writer = nullptr;
+    is_opened = false;
+    dummy_eof = false;
+    write_mode = false;
+    mem_mode = false;
+    space = 0;
+    wrap_margin = 71;
+    fmt = 0;
+    file = 0;
+    gzfile = 0;
+    empty_stream = true;
+
+    strbufv.clear();
+    strbuf = 0;
+    strbufsize = strbufpos = 0;
+    roots.clear();
+
+    fs_data.clear();
+    fs_data_ptrs.clear();
+    fs_data_blksz.clear();
+    freeSpaceOfs = 0;
+
+    str_hash.clear();
+    str_hash_data.clear();
+    str_hash_data.resize(1);
+    str_hash_data[0] = '\0';
+
+    filename.clear();
+    lineno = 0;
+}
+
+FileStorage::Impl::Impl(FileStorage *_fs) {
+    fs_ext = _fs;
+    init();
+}
+
+FileStorage::Impl::~Impl() {
+    release();
+}
+
+void FileStorage::Impl::release(String *out) {
+    if (is_opened) {
+        if (out)
+            out->clear();
+        if (write_mode) {
+            while (write_stack.size() > 1) {
+                endWriteStruct();
             }
+            flush();
+            if (fmt == FileStorage::FORMAT_XML)
+                puts("</opencv_storage>\n");
+            else if (fmt == FileStorage::FORMAT_JSON)
+                puts("}\n");
+        }
+        if (mem_mode && out) {
+            *out = cv::String(outbuf.begin(), outbuf.end());
         }
-        closeFile();
-        init();
     }
+    closeFile();
+    init();
+}
 
-    void analyze_file_name( const std::string& file_name, std::vector<std::string>& params )
-    {
-        params.clear();
-        static const char not_file_name       = '\n';
-        static const char parameter_begin     = '?';
-        static const char parameter_separator = '&';
+void FileStorage::Impl::analyze_file_name(const std::string &file_name, std::vector<std::string> &params) {
+    params.clear();
+    static const char not_file_name = '\n';
+    static const char parameter_begin = '?';
+    static const char parameter_separator = '&';
 
-        if( file_name.find(not_file_name, (size_t)0) != std::string::npos )
-            return;
+    if (file_name.find(not_file_name, (size_t) 0) != std::string::npos)
+        return;
 
-        size_t beg = file_name.find_last_of(parameter_begin);
-        params.push_back(file_name.substr((size_t)0, beg));
+    size_t beg = file_name.find_last_of(parameter_begin);
+    params.push_back(file_name.substr((size_t) 0, beg));
 
-        if( beg != std::string::npos )
-        {
-            size_t end = file_name.size();
-            beg++;
-            for( size_t param_beg = beg, param_end = beg;
-                 param_end < end;
-                 param_beg = param_end + 1 )
-            {
-                param_end = file_name.find_first_of( parameter_separator, param_beg );
-                if( (param_end == std::string::npos || param_end != param_beg) && param_beg + 1 < end )
-                {
-                    params.push_back( file_name.substr( param_beg, param_end - param_beg ) );
-                }
+    if (beg != std::string::npos) {
+        size_t end = file_name.size();
+        beg++;
+        for (size_t param_beg = beg, param_end = beg;
+             param_end < end;
+             param_beg = param_end + 1) {
+            param_end = file_name.find_first_of(parameter_separator, param_beg);
+            if ((param_end == std::string::npos || param_end != param_beg) && param_beg + 1 < end) {
+                params.push_back(file_name.substr(param_beg, param_end - param_beg));
             }
         }
     }
+}
 
-    bool open( const char* filename_or_buf, int _flags, const char* encoding )
-    {
-        _flags &= ~FileStorage::BASE64;
-
-        bool ok = true;
-        release();
+bool FileStorage::Impl::open(const char *filename_or_buf, int _flags, const char *encoding) {
+    bool ok = true;
+    release();
 
-        bool append = (_flags & 3) == FileStorage::APPEND;
-        mem_mode = (_flags & FileStorage::MEMORY) != 0;
+    bool append = (_flags & 3) == FileStorage::APPEND;
+    mem_mode = (_flags & FileStorage::MEMORY) != 0;
 
-        write_mode = (_flags & 3) != 0;
+    write_mode = (_flags & 3) != 0;
+    bool write_base64 = (write_mode || append) && (_flags & FileStorage::BASE64) != 0;
 
-        bool isGZ = false;
-        size_t fnamelen = 0;
+    bool isGZ = false;
+    size_t fnamelen = 0;
 
-        std::vector<std::string> params;
-        //if ( !mem_mode )
-        {
-            analyze_file_name( filename_or_buf, params );
-            if( !params.empty() )
-                filename = params[0];
+    std::vector<std::string> params;
+    //if ( !mem_mode )
+    {
+        analyze_file_name(filename_or_buf, params);
+        if (!params.empty())
+            filename = params[0];
 
-            /*if( !write_base64 && params.size() >= 2 &&
-                std::find(params.begin()+1, params.end(), std::string("base64")) != params.end())
-                write_base64 = (write_mode || append);*/
-        }
+        if (!write_base64 && params.size() >= 2 &&
+            std::find(params.begin() + 1, params.end(), std::string("base64")) != params.end())
+            write_base64 = (write_mode || append);
+    }
 
-        if( filename.size() == 0 && !mem_mode && !write_mode )
-            CV_Error( CV_StsNullPtr, "NULL or empty filename" );
+    if (filename.size() == 0 && !mem_mode && !write_mode)
+        CV_Error(cv::Error::StsNullPtr, "NULL or empty filename");
 
-        if( mem_mode && append )
-            CV_Error( CV_StsBadFlag, "FileStorage::APPEND and FileStorage::MEMORY are not currently compatible" );
+    if (mem_mode && append)
+        CV_Error(cv::Error::StsBadFlag, "FileStorage::APPEND and FileStorage::MEMORY are not currently compatible");
 
-        flags = _flags;
+    flags = _flags;
 
-        if( !mem_mode )
-        {
-            char* dot_pos = strrchr((char*)filename.c_str(), '.');
-            char compression = '\0';
+    if (!mem_mode) {
+        char *dot_pos = strrchr((char *) filename.c_str(), '.');
+        char compression = '\0';
 
-            if( dot_pos && dot_pos[1] == 'g' && dot_pos[2] == 'z' &&
-               (dot_pos[3] == '\0' || (cv_isdigit(dot_pos[3]) && dot_pos[4] == '\0')) )
-            {
-                if( append )
-                {
-                    CV_Error(CV_StsNotImplemented, "Appending data to compressed file is not implemented" );
-                }
-                isGZ = true;
-                compression = dot_pos[3];
-                if( compression )
-                    dot_pos[3] = '\0', fnamelen--;
+        if (dot_pos && dot_pos[1] == 'g' && dot_pos[2] == 'z' &&
+            (dot_pos[3] == '\0' || (cv_isdigit(dot_pos[3]) && dot_pos[4] == '\0'))) {
+            if (append) {
+                CV_Error(cv::Error::StsNotImplemented, "Appending data to compressed file is not implemented");
             }
+            isGZ = true;
+            compression = dot_pos[3];
+            if (compression)
+                dot_pos[3] = '\0', fnamelen--;
+        }
 
-            if( !isGZ )
-            {
-                file = fopen(filename.c_str(), !write_mode ? "rt" : !append ? "wt" : "a+t" );
-                if( !file )
-                    return false;
-            }
-            else
-            {
+        if (!isGZ) {
+            file = fopen(filename.c_str(), !write_mode ? "rt" : !append ? "wt" : "a+t");
+            if (!file)
+                return false;
+        } else {
 #if USE_ZLIB
-                char mode[] = { write_mode ? 'w' : 'r', 'b', compression ? compression : '3', '\0' };
-                gzfile = gzopen(filename.c_str(), mode);
-                if( !gzfile )
-                    return false;
+            char mode[] = {write_mode ? 'w' : 'r', 'b', compression ? compression : '3', '\0'};
+            gzfile = gzopen(filename.c_str(), mode);
+            if (!gzfile)
+                return false;
 #else
-                CV_Error(CV_StsNotImplemented, "There is no compressed file storage support in this configuration");
+            CV_Error(cv::Error::StsNotImplemented, "There is no compressed file storage support in this configuration");
 #endif
-            }
         }
+    }
 
-        roots.clear();
-        fs_data.clear();
-        wrap_margin = 71;
-        fmt = FileStorage::FORMAT_AUTO;
+    roots.clear();
+    fs_data.clear();
+    wrap_margin = 71;
+    fmt = FileStorage::FORMAT_AUTO;
 
-        if( write_mode )
-        {
-            fmt = flags & FileStorage::FORMAT_MASK;
+    if (write_mode) {
+        fmt = flags & FileStorage::FORMAT_MASK;
 
-            if( mem_mode )
-                outbuf.clear();
+        if (mem_mode)
+            outbuf.clear();
 
-            if( fmt == FileStorage::FORMAT_AUTO && !filename.empty() )
-            {
-                const char* dot_pos = NULL;
-                const char* dot_pos2 = NULL;
-                // like strrchr() implementation, but save two last positions simultaneously
-                for (const char* pos = &filename[0]; pos[0] != 0; pos++)
-                {
-                    if( pos[0] == '.' )
-                    {
-                        dot_pos2 = dot_pos;
-                        dot_pos = pos;
-                    }
-                }
-                if (fs::strcasecmp(dot_pos, ".gz") == 0 && dot_pos2 != NULL)
-                {
-                    dot_pos = dot_pos2;
+        if (fmt == FileStorage::FORMAT_AUTO && !filename.empty()) {
+            const char *dot_pos = NULL;
+            const char *dot_pos2 = NULL;
+            // like strrchr() implementation, but save two last positions simultaneously
+            for (const char *pos = &filename[0]; pos[0] != 0; pos++) {
+                if (pos[0] == '.') {
+                    dot_pos2 = dot_pos;
+                    dot_pos = pos;
                 }
-                fmt = (fs::strcasecmp(dot_pos, ".xml") == 0 || fs::strcasecmp(dot_pos, ".xml.gz") == 0 )
-                        ? FileStorage::FORMAT_XML
-                    : (fs::strcasecmp(dot_pos, ".json") == 0 || fs::strcasecmp(dot_pos, ".json.gz") == 0)
-                        ? FileStorage::FORMAT_JSON
-                    : FileStorage::FORMAT_YAML;
             }
-            else if( fmt == FileStorage::FORMAT_AUTO )
-            {
-                fmt = FileStorage::FORMAT_XML;
-            }
-
-            // we use factor=6 for XML (the longest characters (' and ") are encoded with 6 bytes (&apos; and &quot;)
-            // and factor=4 for YAML ( as we use 4 bytes for non ASCII characters (e.g. \xAB))
-            int buf_size = CV_FS_MAX_LEN*(fmt == FileStorage::FORMAT_XML ? 6 : 4) + 1024;
-
-            if (append)
-            {
-                fseek( file, 0, SEEK_END );
-                if (ftell(file) == 0)
-                    append = false;
+            if (fs::strcasecmp(dot_pos, ".gz") == 0 && dot_pos2 != NULL) {
+                dot_pos = dot_pos2;
             }
+            fmt = (fs::strcasecmp(dot_pos, ".xml") == 0 || fs::strcasecmp(dot_pos, ".xml.gz") == 0)
+                  ? FileStorage::FORMAT_XML
+                  : (fs::strcasecmp(dot_pos, ".json") == 0 || fs::strcasecmp(dot_pos, ".json.gz") == 0)
+                    ? FileStorage::FORMAT_JSON
+                    : FileStorage::FORMAT_YAML;
+        } else if (fmt == FileStorage::FORMAT_AUTO) {
+            fmt = FileStorage::FORMAT_XML;
+        }
 
-            write_stack.clear();
-            empty_stream = true;
-            write_stack.push_back(FStructData("", FileNode::MAP | FileNode::EMPTY, 0));
-            buffer.reserve(buf_size + 1024);
-            buffer.resize(buf_size);
-            bufofs = 0;
+        // we use factor=6 for XML (the longest characters (' and ") are encoded with 6 bytes (&apos; and &quot;)
+        // and factor=4 for YAML ( as we use 4 bytes for non ASCII characters (e.g. \xAB))
+        int buf_size = CV_FS_MAX_LEN * (fmt == FileStorage::FORMAT_XML ? 6 : 4) + 1024;
 
-            if( fmt == FileStorage::FORMAT_XML )
-            {
-                size_t file_size = file ? (size_t)ftell(file) : (size_t)0;
-                if( !append || file_size == 0 )
-                {
-                    if( encoding && *encoding != '\0' )
-                    {
-                        if( fs::strcasecmp(encoding, "UTF-16" ) == 0 )
-                        {
-                            release();
-                            CV_Error( CV_StsBadArg, "UTF-16 XML encoding is not supported! Use 8-bit encoding\n");
-                        }
+        if (append) {
+            fseek(file, 0, SEEK_END);
+            if (ftell(file) == 0)
+                append = false;
+        }
 
-                        CV_Assert( strlen(encoding) < 1000 );
-                        char buf[1100];
-                        sprintf(buf, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding);
-                        puts( buf );
+        write_stack.clear();
+        empty_stream = true;
+        write_stack.push_back(FStructData("", FileNode::MAP | FileNode::EMPTY, 0));
+        buffer.reserve(buf_size + 1024);
+        buffer.resize(buf_size);
+        bufofs = 0;
+        is_using_base64 = write_base64;
+        state_of_writing_base64 = FileStorage_API::Base64State::Uncertain;
+
+        if (fmt == FileStorage::FORMAT_XML) {
+            size_t file_size = file ? (size_t) ftell(file) : (size_t) 0;
+            if (!append || file_size == 0) {
+                if (encoding && *encoding != '\0') {
+                    if (fs::strcasecmp(encoding, "UTF-16") == 0) {
+                        release();
+                        CV_Error(cv::Error::StsBadArg, "UTF-16 XML encoding is not supported! Use 8-bit encoding\n");
                     }
-                    else
-                        puts( "<?xml version=\"1.0\"?>\n" );
-                    puts( "<opencv_storage>\n" );
-                }
-                else
-                {
-                    int xml_buf_size = 1 << 10;
-                    char substr[] = "</opencv_storage>";
-                    int last_occurrence = -1;
-                    xml_buf_size = MIN(xml_buf_size, int(file_size));
-                    fseek( file, -xml_buf_size, SEEK_END );
-                    // find the last occurrence of </opencv_storage>
-                    for(;;)
-                    {
-                        int line_offset = (int)ftell( file );
-                        const char* ptr0 = this->gets(xml_buf_size);
-                        const char* ptr = NULL;
-                        if( !ptr0 )
+
+                    CV_Assert(strlen(encoding) < 1000);
+                    char buf[1100];
+                    sprintf(buf, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding);
+                    puts(buf);
+                } else
+                    puts("<?xml version=\"1.0\"?>\n");
+                puts("<opencv_storage>\n");
+            } else {
+                int xml_buf_size = 1 << 10;
+                char substr[] = "</opencv_storage>";
+                int last_occurrence = -1;
+                xml_buf_size = MIN(xml_buf_size, int(file_size));
+                fseek(file, -xml_buf_size, SEEK_END);
+                // find the last occurrence of </opencv_storage>
+                for (;;) {
+                    int line_offset = (int) ftell(file);
+                    const char *ptr0 = this->gets(xml_buf_size);
+                    const char *ptr = NULL;
+                    if (!ptr0)
+                        break;
+                    ptr = ptr0;
+                    for (;;) {
+                        ptr = strstr(ptr, substr);
+                        if (!ptr)
                             break;
-                        ptr = ptr0;
-                        for(;;)
-                        {
-                            ptr = strstr( ptr, substr );
-                            if( !ptr )
-                                break;
-                            last_occurrence = line_offset + (int)(ptr - ptr0);
-                            ptr += strlen(substr);
-                        }
+                        last_occurrence = line_offset + (int) (ptr - ptr0);
+                        ptr += strlen(substr);
                     }
-                    if( last_occurrence < 0 )
-                    {
-                        release();
-                        CV_Error( CV_StsError, "Could not find </opencv_storage> in the end of file.\n" );
-                    }
-                    closeFile();
-                    file = fopen( filename.c_str(), "r+t" );
-                    CV_Assert(file != 0);
-                    fseek( file, last_occurrence, SEEK_SET );
-                    // replace the last "</opencv_storage>" with " <!-- resumed -->", which has the same length
-                    puts( " <!-- resumed -->" );
-                    fseek( file, 0, SEEK_END );
-                    puts( "\n" );
                 }
-
-                emitter = createXMLEmitter(this);
+                if (last_occurrence < 0) {
+                    release();
+                    CV_Error(cv::Error::StsError, "Could not find </opencv_storage> in the end of file.\n");
+                }
+                closeFile();
+                file = fopen(filename.c_str(), "r+t");
+                CV_Assert(file != 0);
+                fseek(file, last_occurrence, SEEK_SET);
+                // replace the last "</opencv_storage>" with " <!-- resumed -->", which has the same length
+                puts(" <!-- resumed -->");
+                fseek(file, 0, SEEK_END);
+                puts("\n");
             }
-            else if( fmt == FileStorage::FORMAT_YAML )
-            {
-                if( !append)
-                    puts( "%YAML:1.0\n---\n" );
-                else
-                    puts( "...\n---\n" );
 
-                emitter = createYAMLEmitter(this);
-            }
+            emitter = createXMLEmitter(this);
+        } else if (fmt == FileStorage::FORMAT_YAML) {
+            if (!append)
+                puts("%YAML:1.0\n---\n");
             else
-            {
-                CV_Assert( fmt == FileStorage::FORMAT_JSON );
-                if( !append )
-                    puts( "{\n" );
-                else
-                {
-                    bool valid = false;
-                    long roffset = 0;
-                    for ( ;
-                         fseek( file, roffset, SEEK_END ) == 0;
-                         roffset -= 1 )
-                    {
-                        const char end_mark = '}';
-                        if ( fgetc( file ) == end_mark )
-                        {
-                            fseek( file, roffset, SEEK_END );
-                            valid = true;
-                            break;
-                        }
+                puts("...\n---\n");
+
+            emitter = createYAMLEmitter(this);
+        } else {
+            CV_Assert(fmt == FileStorage::FORMAT_JSON);
+            if (!append)
+                puts("{\n");
+            else {
+                bool valid = false;
+                long roffset = 0;
+                for (;
+                        fseek(file, roffset, SEEK_END) == 0;
+                        roffset -= 1) {
+                    const char end_mark = '}';
+                    if (fgetc(file) == end_mark) {
+                        fseek(file, roffset, SEEK_END);
+                        valid = true;
+                        break;
                     }
+                }
 
-                    if ( valid )
-                    {
-                        closeFile();
-                        file = fopen( filename.c_str(), "r+t" );
-                        CV_Assert(file != 0);
-                        fseek( file, roffset, SEEK_END );
-                        fputs( ",", file );
-                    }
-                    else
-                    {
-                        CV_Error( CV_StsError, "Could not find '}' in the end of file.\n" );
-                    }
+                if (valid) {
+                    closeFile();
+                    file = fopen(filename.c_str(), "r+t");
+                    CV_Assert(file != 0);
+                    fseek(file, roffset, SEEK_END);
+                    fputs(",", file);
+                } else {
+                    CV_Error(cv::Error::StsError, "Could not find '}' in the end of file.\n");
                 }
-                write_stack.back().indent = 4;
-                emitter = createJSONEmitter(this);
             }
-            is_opened = true;
+            write_stack.back().indent = 4;
+            emitter = createJSONEmitter(this);
+        }
+        is_opened = true;
+    } else {
+        const size_t buf_size0 = 40;
+        buffer.resize(buf_size0);
+        if (mem_mode) {
+            strbuf = (char *) filename_or_buf;
+            strbufsize = strlen(strbuf);
         }
-        else
-        {
-            const size_t buf_size0 = 40;
-            buffer.resize(buf_size0);
-            if( mem_mode )
-            {
-                strbuf = (char*)filename_or_buf;
-                strbufsize = strlen(strbuf);
-            }
 
-            const char* yaml_signature = "%YAML";
-            const char* json_signature = "{";
-            const char* xml_signature  = "<?xml";
-            char* buf = this->gets(16);
-            CV_Assert(buf);
-            char* bufPtr = cv_skip_BOM(buf);
-            size_t bufOffset = bufPtr - buf;
-
-            if(strncmp( bufPtr, yaml_signature, strlen(yaml_signature) ) == 0)
-                fmt = FileStorage::FORMAT_YAML;
-            else if(strncmp( bufPtr, json_signature, strlen(json_signature) ) == 0)
-                fmt = FileStorage::FORMAT_JSON;
-            else if(strncmp( bufPtr, xml_signature, strlen(xml_signature) ) == 0)
-                fmt = FileStorage::FORMAT_XML;
-            else if(strbufsize  == bufOffset)
-                CV_Error(CV_BADARG_ERR, "Input file is invalid");
-            else
-                CV_Error(CV_BADARG_ERR, "Unsupported file storage format");
+        const char *yaml_signature = "%YAML";
+        const char *json_signature = "{";
+        const char *xml_signature = "<?xml";
+        char *buf = this->gets(16);
+        CV_Assert(buf);
+        char *bufPtr = cv_skip_BOM(buf);
+        size_t bufOffset = bufPtr - buf;
+
+        if (strncmp(bufPtr, yaml_signature, strlen(yaml_signature)) == 0)
+            fmt = FileStorage::FORMAT_YAML;
+        else if (strncmp(bufPtr, json_signature, strlen(json_signature)) == 0)
+            fmt = FileStorage::FORMAT_JSON;
+        else if (strncmp(bufPtr, xml_signature, strlen(xml_signature)) == 0)
+            fmt = FileStorage::FORMAT_XML;
+        else if (strbufsize == bufOffset)
+            CV_Error(cv::Error::StsBadArg, "Input file is invalid");
+        else
+            CV_Error(cv::Error::StsBadArg, "Unsupported file storage format");
 
-            rewind();
-            strbufpos = bufOffset;
-            bufofs = 0;
+        rewind();
+        strbufpos = bufOffset;
+        bufofs = 0;
 
-            try
-            {
-                char* ptr = bufferStart();
-                ptr[0] = ptr[1] = ptr[2] = '\0';
-                FileNode root_nodes(fs_ext, 0, 0);
+        try {
+            char *ptr = bufferStart();
+            ptr[0] = ptr[1] = ptr[2] = '\0';
+            FileNode root_nodes(fs_ext, 0, 0);
 
-                uchar* rptr = reserveNodeSpace(root_nodes, 9);
-                *rptr = FileNode::SEQ;
-                writeInt(rptr + 1, 4);
-                writeInt(rptr + 5, 0);
+            uchar *rptr = reserveNodeSpace(root_nodes, 9);
+            *rptr = FileNode::SEQ;
+            writeInt(rptr + 1, 4);
+            writeInt(rptr + 5, 0);
 
-                roots.clear();
+            roots.clear();
 
-                switch (fmt)
-                {
-                    case FileStorage::FORMAT_XML: parser = createXMLParser(this); break;
-                    case FileStorage::FORMAT_YAML: parser = createYAMLParser(this); break;
-                    case FileStorage::FORMAT_JSON: parser = createJSONParser(this); break;
-                    default: parser = Ptr<FileStorageParser>();
-                }
+            switch (fmt) {
+                case FileStorage::FORMAT_XML:
+                    parser = createXMLParser(this);
+                    break;
+                case FileStorage::FORMAT_YAML:
+                    parser = createYAMLParser(this);
+                    break;
+                case FileStorage::FORMAT_JSON:
+                    parser = createJSONParser(this);
+                    break;
+                default:
+                    parser = Ptr<FileStorageParser>();
+            }
 
-                if( !parser.empty() )
-                {
-                    ok = parser->parse(ptr);
-                    if( ok )
-                    {
-                        finalizeCollection(root_nodes);
+            if (!parser.empty()) {
+                ok = parser->parse(ptr);
+                if (ok) {
+                    finalizeCollection(root_nodes);
 
-                        CV_Assert( !fs_data_ptrs.empty() );
-                        FileNode roots_node(fs_ext, 0, 0);
-                        size_t i, nroots = roots_node.size();
-                        FileNodeIterator it = roots_node.begin();
+                    CV_Assert(!fs_data_ptrs.empty());
+                    FileNode roots_node(fs_ext, 0, 0);
+                    size_t i, nroots = roots_node.size();
+                    FileNodeIterator it = roots_node.begin();
 
-                        for( i = 0; i < nroots; i++, ++it )
-                            roots.push_back(*it);
-                    }
+                    for (i = 0; i < nroots; i++, ++it)
+                        roots.push_back(*it);
                 }
             }
-            catch(...)
-            {
-                is_opened = true;
-                release();
-                throw;
-            }
-
-            // release resources that we do not need anymore
-            closeFile();
+        }
+        catch (...) {
             is_opened = true;
-            std::vector<char> tmpbuf;
-            std::swap(buffer, tmpbuf);
-            bufofs = 0;
+            release();
+            throw;
         }
-        return ok;
+
+        // release resources that we do not need anymore
+        closeFile();
+        is_opened = true;
+        std::vector<char> tmpbuf;
+        std::swap(buffer, tmpbuf);
+        bufofs = 0;
     }
+    return ok;
+}
 
-    void puts( const char* str )
-    {
-        CV_Assert( write_mode );
-        if( mem_mode )
-            std::copy(str, str + strlen(str), std::back_inserter(outbuf));
-        else if( file )
-            fputs( str, file );
+void FileStorage::Impl::puts(const char *str) {
+    CV_Assert(write_mode);
+    if (mem_mode)
+        std::copy(str, str + strlen(str), std::back_inserter(outbuf));
+    else if (file)
+        fputs(str, file);
 #if USE_ZLIB
-        else if( gzfile )
-            gzputs( gzfile, str );
+    else if (gzfile)
+        gzputs(gzfile, str);
 #endif
-        else
-            CV_Error( CV_StsError, "The storage is not opened" );
-    }
-
-    char* getsFromFile( char* buf, int count )
-    {
-        if( file )
-            return fgets( buf, count, file );
-    #if USE_ZLIB
-        if( gzfile )
-            return gzgets( gzfile, buf, count );
-    #endif
-        CV_Error(CV_StsError, "The storage is not opened");
-    }
+    else
+        CV_Error(cv::Error::StsError, "The storage is not opened");
+}
 
-    char* gets( size_t maxCount )
-    {
-        if( strbuf )
-        {
-            size_t i = strbufpos, len = strbufsize;
-            const char* instr = strbuf;
-            for( ; i < len; i++ )
-            {
-                char c = instr[i];
-                if( c == '\0' || c == '\n' )
-                {
-                    if( c == '\n' )
-                        i++;
-                    break;
-                }
+char *FileStorage::Impl::getsFromFile(char *buf, int count) {
+    if (file)
+        return fgets(buf, count, file);
+#if USE_ZLIB
+    if (gzfile)
+        return gzgets(gzfile, buf, count);
+#endif
+    CV_Error(cv::Error::StsError, "The storage is not opened");
+}
+
+char *FileStorage::Impl::gets(size_t maxCount) {
+    if (strbuf) {
+        size_t i = strbufpos, len = strbufsize;
+        const char *instr = strbuf;
+        for (; i < len; i++) {
+            char c = instr[i];
+            if (c == '\0' || c == '\n') {
+                if (c == '\n')
+                    i++;
+                break;
             }
-            size_t count = i - strbufpos;
-            if( maxCount == 0 || maxCount > count )
-                maxCount = count;
-            buffer.resize(std::max(buffer.size(), maxCount + 8));
-            memcpy(&buffer[0], instr + strbufpos, maxCount);
-            buffer[maxCount] = '\0';
-            strbufpos = i;
-            return maxCount > 0 ? &buffer[0] : 0;
         }
+        size_t count = i - strbufpos;
+        if (maxCount == 0 || maxCount > count)
+            maxCount = count;
+        buffer.resize(std::max(buffer.size(), maxCount + 8));
+        memcpy(&buffer[0], instr + strbufpos, maxCount);
+        buffer[maxCount] = '\0';
+        strbufpos = i;
+        return maxCount > 0 ? &buffer[0] : 0;
+    }
+
+    const size_t MAX_BLOCK_SIZE = INT_MAX / 2; // hopefully, that will be enough
+    if (maxCount == 0)
+        maxCount = MAX_BLOCK_SIZE;
+    else
+        CV_Assert(maxCount < MAX_BLOCK_SIZE);
+    size_t ofs = 0;
 
-        const size_t MAX_BLOCK_SIZE = INT_MAX/2; // hopefully, that will be enough
-        if( maxCount == 0 )
-            maxCount = MAX_BLOCK_SIZE;
-        else
-            CV_Assert(maxCount < MAX_BLOCK_SIZE);
-        size_t ofs = 0;
-
-        for(;;)
-        {
-            int count = (int)std::min(buffer.size() - ofs - 16, maxCount);
-            char* ptr = getsFromFile( &buffer[ofs], count+1 );
-            if( !ptr )
-                break;
-            int delta = (int)strlen(ptr);
-            ofs += delta;
-            maxCount -= delta;
-            if( ptr[delta-1] == '\n' || maxCount == 0 )
-                break;
-            if( delta == count )
-                buffer.resize((size_t)(buffer.size()*1.5));
-        }
-        return ofs > 0 ? &buffer[0] : 0;
+    for (;;) {
+        int count = (int) std::min(buffer.size() - ofs - 16, maxCount);
+        char *ptr = getsFromFile(&buffer[ofs], count + 1);
+        if (!ptr)
+            break;
+        int delta = (int) strlen(ptr);
+        ofs += delta;
+        maxCount -= delta;
+        if (ptr[delta - 1] == '\n' || maxCount == 0)
+            break;
+        if (delta == count)
+            buffer.resize((size_t) (buffer.size() * 1.5));
     }
+    return ofs > 0 ? &buffer[0] : 0;
+}
 
-    char* gets()
-    {
-        char* ptr = this->gets(0);
-        if( !ptr )
-        {
-            ptr = bufferStart();  // FIXIT Why do we need this hack? What is about other parsers JSON/YAML?
-            *ptr = '\0';
-            setEof();
-            return 0;
-        }
-        else
-        {
-            size_t l = strlen(ptr);
-            if( l > 0 && ptr[l-1] != '\n' && ptr[l-1] != '\r' && !eof() )
-            {
-                ptr[l] = '\n';
-                ptr[l+1] = '\0';
-            }
+char *FileStorage::Impl::gets() {
+    char *ptr = this->gets(0);
+    if (!ptr) {
+        ptr = bufferStart();  // FIXIT Why do we need this hack? What is about other parsers JSON/YAML?
+        *ptr = '\0';
+        setEof();
+        return 0;
+    } else {
+        size_t l = strlen(ptr);
+        if (l > 0 && ptr[l - 1] != '\n' && ptr[l - 1] != '\r' && !eof()) {
+            ptr[l] = '\n';
+            ptr[l + 1] = '\0';
         }
-        lineno++;
-        return ptr;
     }
+    lineno++;
+    return ptr;
+}
 
-    bool eof()
-    {
-        if( dummy_eof )
-            return true;
-        if( strbuf )
-            return strbufpos >= strbufsize;
-        if( file )
-            return feof(file) != 0;
+bool FileStorage::Impl::eof() {
+    if (dummy_eof)
+        return true;
+    if (strbuf)
+        return strbufpos >= strbufsize;
+    if (file)
+        return feof(file) != 0;
 #if USE_ZLIB
-        if( gzfile )
-            return gzeof(gzfile) != 0;
+    if (gzfile)
+        return gzeof(gzfile) != 0;
 #endif
-        return false;
-    }
+    return false;
+}
 
-    void setEof()
-    {
-        dummy_eof = true;
-    }
+void FileStorage::Impl::setEof() {
+    dummy_eof = true;
+}
 
-    void closeFile()
-    {
-        if( file )
-            fclose( file );
+void FileStorage::Impl::closeFile() {
+    if (file)
+        fclose(file);
 #if USE_ZLIB
-        else if( gzfile )
-            gzclose( gzfile );
+    else if (gzfile)
+        gzclose(gzfile);
 #endif
-        file = 0;
-        gzfile = 0;
-        strbuf = 0;
-        strbufpos = 0;
-        is_opened = false;
-    }
+    file = 0;
+    gzfile = 0;
+    strbuf = 0;
+    strbufpos = 0;
+    is_opened = false;
+}
 
-    void rewind()
-    {
-        if( file )
-            ::rewind(file);
+void FileStorage::Impl::rewind() {
+    if (file)
+        ::rewind(file);
 #if USE_ZLIB
-        else if( gzfile )
-            gzrewind(gzfile);
+    else if (gzfile)
+        gzrewind(gzfile);
 #endif
-        strbufpos = 0;
-    }
+    strbufpos = 0;
+}
 
-    char* resizeWriteBuffer( char* ptr, int len )
-    {
-        const char* buffer_end = &buffer[0] + buffer.size();
-        if( ptr + len < buffer_end )
-            return ptr;
+char *FileStorage::Impl::resizeWriteBuffer(char *ptr, int len) {
+    const char *buffer_end = &buffer[0] + buffer.size();
+    if (ptr + len < buffer_end)
+        return ptr;
+
+    const char *buffer_start = &buffer[0];
+    int written_len = (int) (ptr - buffer_start);
 
-        const char* buffer_start = &buffer[0];
-        int written_len = (int)(ptr - buffer_start);
+    CV_Assert(written_len <= (int) buffer.size());
+    int new_size = (int) ((buffer_end - buffer_start) * 3 / 2);
+    new_size = MAX(written_len + len, new_size);
+    buffer.reserve(new_size + 256);
+    buffer.resize(new_size);
+    bufofs = written_len;
+    return &buffer[0] + bufofs;
+}
+
+char *FileStorage::Impl::flush() {
+    char *buffer_start = &buffer[0];
+    char *ptr = buffer_start + bufofs;
 
-        CV_Assert(written_len <= (int)buffer.size());
-        int new_size = (int)((buffer_end - buffer_start)*3/2);
-        new_size = MAX( written_len + len, new_size );
-        buffer.reserve( new_size + 256 );
-        buffer.resize( new_size );
-        bufofs = written_len;
-        return &buffer[0] + bufofs;
+    if (ptr > buffer_start + space) {
+        ptr[0] = '\n';
+        ptr[1] = '\0';
+        puts(buffer_start);
+        bufofs = 0;
     }
 
-    char* flush()
-    {
-        char* buffer_start = &buffer[0];
-        char* ptr = buffer_start + bufofs;
+    int indent = write_stack.back().indent;
 
-        if( ptr > buffer_start + space )
-        {
-            ptr[0] = '\n';
-            ptr[1] = '\0';
-            puts( buffer_start );
-            bufofs = 0;
-        }
+    if (space != indent) {
+        memset(buffer_start, ' ', indent);
+        space = indent;
+    }
+    bufofs = space;
+    ptr = buffer_start + bufofs;
 
-        int indent = write_stack.back().indent;
+    return ptr;
+}
 
-        if( space != indent )
-        {
-            memset( buffer_start, ' ', indent );
-            space = indent;
-        }
-        bufofs = space;
-        ptr = buffer_start + bufofs;
+void FileStorage::Impl::endWriteStruct() {
+    CV_Assert(write_mode);
 
-        return ptr;
-    }
+    check_if_write_struct_is_delayed(false);
+    if (state_of_writing_base64 != FileStorage_API::Uncertain)
+        switch_to_Base64_state(FileStorage_API::Uncertain);
 
-    void endWriteStruct()
-    {
-        CV_Assert( write_mode );
-        CV_Assert( !write_stack.empty() );
+    CV_Assert(!write_stack.empty());
 
-        FStructData& current_struct = write_stack.back();
-        if( fmt == FileStorage::FORMAT_JSON && !FileNode::isFlow(current_struct.flags) && write_stack.size() > 1 )
-            current_struct.indent = write_stack[write_stack.size() - 2].indent;
+    FStructData &current_struct = write_stack.back();
+    if (fmt == FileStorage::FORMAT_JSON && !FileNode::isFlow(current_struct.flags) && write_stack.size() > 1)
+        current_struct.indent = write_stack[write_stack.size() - 2].indent;
 
-        emitter->endWriteStruct(current_struct);
+    emitter->endWriteStruct(current_struct);
 
-        write_stack.pop_back();
-        if( !write_stack.empty() )
-            write_stack.back().flags &= ~FileNode::EMPTY;
-    }
+    write_stack.pop_back();
+    if (!write_stack.empty())
+        write_stack.back().flags &= ~FileNode::EMPTY;
+}
 
-    void startWriteStruct( const char* key, int struct_flags,
-                           const char* type_name )
-    {
-        CV_Assert( write_mode );
+void FileStorage::Impl::startWriteStruct_helper(const char *key, int struct_flags,
+                                                const char *type_name) {
+    CV_Assert(write_mode);
 
-        struct_flags = (struct_flags & (FileNode::TYPE_MASK|FileNode::FLOW)) | FileNode::EMPTY;
-        if( !FileNode::isCollection(struct_flags))
-            CV_Error( CV_StsBadArg,
-                     "Some collection type: FileNode::SEQ or FileNode::MAP must be specified" );
+    struct_flags = (struct_flags & (FileNode::TYPE_MASK | FileNode::FLOW)) | FileNode::EMPTY;
+    if (!FileNode::isCollection(struct_flags))
+        CV_Error(cv::Error::StsBadArg,
+                 "Some collection type: FileNode::SEQ or FileNode::MAP must be specified");
 
-        if( type_name && type_name[0] == '\0' )
-            type_name = 0;
+    if (type_name && type_name[0] == '\0')
+        type_name = 0;
 
-        FStructData s = emitter->startWriteStruct( write_stack.back(), key, struct_flags, type_name );
-        write_stack.push_back(s);
-        size_t write_stack_size = write_stack.size();
-        if( write_stack_size > 1 )
-            write_stack[write_stack_size-2].flags &= ~FileNode::EMPTY;
+    FStructData s = emitter->startWriteStruct(write_stack.back(), key, struct_flags, type_name);
 
-        if( !FileNode::isFlow(s.flags) )
-            flush();
+    write_stack.push_back(s);
+    size_t write_stack_size = write_stack.size();
+    if (write_stack_size > 1)
+        write_stack[write_stack_size - 2].flags &= ~FileNode::EMPTY;
 
-        if( fmt == FileStorage::FORMAT_JSON && type_name && type_name[0] && FileNode::isMap(struct_flags))
-        {
-            emitter->write("type_id", type_name, false);
-        }
-    }
+    if (fmt != FileStorage::FORMAT_JSON && !FileNode::isFlow(s.flags))
+        flush();
 
-    void writeComment( const char* comment, bool eol_comment )
-    {
-        CV_Assert(write_mode);
-        emitter->writeComment( comment, eol_comment );
+    if (fmt == FileStorage::FORMAT_JSON && type_name && type_name[0] && FileNode::isMap(struct_flags)) {
+        emitter->write("type_id", type_name, false);
     }
+}
 
-    void startNextStream()
-    {
-        CV_Assert(write_mode);
-        if( !empty_stream )
-        {
-            while( !write_stack.empty() )
-                endWriteStruct();
-            flush();
-            emitter->startNextStream();
-            empty_stream = true;
-            write_stack.push_back(FStructData("", FileNode::EMPTY, 0));
-            bufofs = 0;
-        }
-    }
+void FileStorage::Impl::startWriteStruct(const char *key, int struct_flags,
+                                         const char *type_name) {
+    check_if_write_struct_is_delayed(false);
+    if (state_of_writing_base64 == FileStorage_API::NotUse)
+        switch_to_Base64_state(FileStorage_API::Uncertain);
 
-    void write( const String& key, int value )
-    {
-        CV_Assert(write_mode);
-        emitter->write(key.c_str(), value);
-    }
+    if (state_of_writing_base64 == FileStorage_API::Uncertain && FileNode::isSeq(struct_flags)
+        && is_using_base64 && type_name == 0) {
+        /* Uncertain whether output Base64 data */
+        make_write_struct_delayed(key, struct_flags, type_name);
+    } else if (type_name && memcmp(type_name, "binary", 6) == 0) {
+        /* Must output Base64 data */
+        if ((FileNode::TYPE_MASK & struct_flags) != FileNode::SEQ)
+            CV_Error(cv::Error::StsBadArg, "must set 'struct_flags |= CV_NODE_SEQ' if using Base64.");
+        else if (state_of_writing_base64 != FileStorage_API::Uncertain)
+            CV_Error(cv::Error::StsError, "function \'cvStartWriteStruct\' calls cannot be nested if using Base64.");
 
-    void write( const String& key, double value )
-    {
-        CV_Assert(write_mode);
-        emitter->write(key.c_str(), value);
+        startWriteStruct_helper(key, struct_flags, "binary");
+
+        if (state_of_writing_base64 != FileStorage_API::Uncertain)
+            switch_to_Base64_state(FileStorage_API::Uncertain);
+        switch_to_Base64_state(FileStorage_API::InUse);
+    } else {
+        /* Won't output Base64 data */
+        if (state_of_writing_base64 == FileStorage_API::InUse)
+            CV_Error(cv::Error::StsError, "At the end of the output Base64, `cvEndWriteStruct` is needed.");
+
+        startWriteStruct_helper(key, struct_flags, type_name);
+
+        if (state_of_writing_base64 != FileStorage_API::Uncertain)
+            switch_to_Base64_state(FileStorage_API::Uncertain);
+        switch_to_Base64_state(FileStorage_API::NotUse);
     }
+}
 
-    void write( const String& key, const String& value )
-    {
-        CV_Assert(write_mode);
-        emitter->write(key.c_str(), value.c_str(), false);
+void FileStorage::Impl::writeComment(const char *comment, bool eol_comment) {
+    CV_Assert(write_mode);
+    emitter->writeComment(comment, eol_comment);
+}
+
+void FileStorage::Impl::startNextStream() {
+    CV_Assert(write_mode);
+    if (!empty_stream) {
+        while (!write_stack.empty())
+            endWriteStruct();
+        flush();
+        emitter->startNextStream();
+        empty_stream = true;
+        write_stack.push_back(FStructData("", FileNode::EMPTY, 0));
+        bufofs = 0;
     }
+}
 
-    void writeRawData( const std::string& dt, const void* _data, size_t len )
-    {
-        CV_Assert(write_mode);
+void FileStorage::Impl::write(const String &key, int value) {
+    CV_Assert(write_mode);
+    emitter->write(key.c_str(), value);
+}
 
-        size_t elemSize = fs::calcStructSize(dt.c_str(), 0);
-        CV_Assert( len % elemSize == 0 );
-        len /= elemSize;
+void FileStorage::Impl::write(const String &key, double value) {
+    CV_Assert(write_mode);
+    emitter->write(key.c_str(), value);
+}
 
-        bool explicitZero = fmt == FileStorage::FORMAT_JSON;
-        const uchar* data0 = (const uchar*)_data;
-        int fmt_pairs[CV_FS_MAX_FMT_PAIRS*2], k, fmt_pair_count;
-        char buf[256] = "";
+void FileStorage::Impl::write(const String &key, const String &value) {
+    CV_Assert(write_mode);
+    emitter->write(key.c_str(), value.c_str(), false);
+}
 
-        fmt_pair_count = fs::decodeFormat( dt.c_str(), fmt_pairs, CV_FS_MAX_FMT_PAIRS );
+void FileStorage::Impl::writeRawData(const std::string &dt, const void *_data, size_t len) {
+    CV_Assert(write_mode);
 
-        if( !len )
-            return;
+    if (is_using_base64 || state_of_writing_base64 == FileStorage_API::Base64State::InUse) {
+        writeRawDataBase64(_data, len, dt.c_str());
+        return;
+    } else if (state_of_writing_base64 == FileStorage_API::Base64State::Uncertain) {
+        switch_to_Base64_state(FileStorage_API::Base64State::NotUse);
+    }
 
-        if( !data0 )
-            CV_Error( CV_StsNullPtr, "Null data pointer" );
+    size_t elemSize = fs::calcStructSize(dt.c_str(), 0);
+    CV_Assert(elemSize);
+    CV_Assert(len % elemSize == 0);
+    len /= elemSize;
 
-        if( fmt_pair_count == 1 )
-        {
-            fmt_pairs[0] *= (int)len;
-            len = 1;
-        }
+    bool explicitZero = fmt == FileStorage::FORMAT_JSON;
+    const uchar *data0 = (const uchar *) _data;
+    int fmt_pairs[CV_FS_MAX_FMT_PAIRS * 2], k, fmt_pair_count;
+    char buf[256] = "";
 
-        for(;len--; data0 += elemSize)
-        {
-            int offset = 0;
-            for( k = 0; k < fmt_pair_count; k++ )
-            {
-                int i, count = fmt_pairs[k*2];
-                int elem_type = fmt_pairs[k*2+1];
-                int elem_size = CV_ELEM_SIZE(elem_type);
-                const char *ptr;
+    fmt_pair_count = fs::decodeFormat(dt.c_str(), fmt_pairs, CV_FS_MAX_FMT_PAIRS);
 
-                offset = cvAlign( offset, elem_size );
-                const uchar* data = data0 + offset;
+    if (!len)
+        return;
 
-                for( i = 0; i < count; i++ )
-                {
-                    switch( elem_type )
-                    {
+    if (!data0)
+        CV_Error(cv::Error::StsNullPtr, "Null data pointer");
+
+    if (fmt_pair_count == 1) {
+        fmt_pairs[0] *= (int) len;
+        len = 1;
+    }
+
+    for (; len--; data0 += elemSize) {
+        int offset = 0;
+        for (k = 0; k < fmt_pair_count; k++) {
+            int i, count = fmt_pairs[k * 2];
+            int elem_type = fmt_pairs[k * 2 + 1];
+            int elem_size = CV_ELEM_SIZE(elem_type);
+            const char *ptr;
+
+            offset = cvAlign(offset, elem_size);
+            const uchar *data = data0 + offset;
+
+            for (i = 0; i < count; i++) {
+                switch (elem_type) {
                     case CV_8U:
-                        ptr = fs::itoa( *(uchar*)data, buf, 10 );
+                        ptr = fs::itoa(*(uchar *) data, buf, 10);
                         data++;
                         break;
                     case CV_8S:
-                        ptr = fs::itoa( *(char*)data, buf, 10 );
+                        ptr = fs::itoa(*(char *) data, buf, 10);
                         data++;
                         break;
                     case CV_16U:
-                        ptr = fs::itoa( *(ushort*)data, buf, 10 );
+                        ptr = fs::itoa(*(ushort *) data, buf, 10);
                         data += sizeof(ushort);
                         break;
                     case CV_16S:
-                        ptr = fs::itoa( *(short*)data, buf, 10 );
+                        ptr = fs::itoa(*(short *) data, buf, 10);
                         data += sizeof(short);
                         break;
                     case CV_32S:
-                        ptr = fs::itoa( *(int*)data, buf, 10 );
+                        ptr = fs::itoa(*(int *) data, buf, 10);
                         data += sizeof(int);
                         break;
                     case CV_32F:
-                        ptr = fs::floatToString( buf, *(float*)data, false, explicitZero );
+                        ptr = fs::floatToString(buf, *(float *) data, false, explicitZero);
                         data += sizeof(float);
                         break;
                     case CV_64F:
-                        ptr = fs::doubleToString( buf, *(double*)data, explicitZero );
+                        ptr = fs::doubleToString(buf, *(double *) data, explicitZero);
                         data += sizeof(double);
                         break;
                     case CV_16F: /* reference */
-                        ptr = fs::floatToString( buf, (float)*(float16_t*)data, true, explicitZero );
+                        ptr = fs::floatToString(buf, (float) *(float16_t *) data, true, explicitZero);
                         data += sizeof(float16_t);
                         break;
                     default:
-                        CV_Error( CV_StsUnsupportedFormat, "Unsupported type" );
+                        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported type");
                         return;
-                    }
-
-                    emitter->writeScalar(0, ptr);
                 }
 
-                offset = (int)(data - data0);
+                emitter->writeScalar(0, ptr);
             }
+
+            offset = (int) (data - data0);
         }
     }
+}
 
-    void writeRawDataBase64(const void* /*data*/, int /*len*/, const char* /*dt*/ )
-    {
+void FileStorage::Impl::workaround() {
+    check_if_write_struct_is_delayed(false);
 
-    }
+    if (state_of_writing_base64 != FileStorage_API::Base64State::Uncertain)
+        switch_to_Base64_state(FileStorage_API::Base64State::Uncertain);
+}
 
-    String releaseAndGetString();
+void FileStorage::Impl::switch_to_Base64_state(FileStorage_API::Base64State new_state) {
+    const char *err_unkonwn_state = "Unexpected error, unable to determine the Base64 state.";
+    const char *err_unable_to_switch = "Unexpected error, unable to switch to this state.";
 
-    FileNode getFirstTopLevelNode() const
-    {
-        return roots.empty() ? FileNode() : roots[0];
+    /* like a finite state machine */
+    switch (state_of_writing_base64) {
+        case FileStorage_API::Base64State::Uncertain:
+            switch (new_state) {
+                case FileStorage_API::Base64State::InUse:
+                {
+                    CV_DbgAssert(base64_writer == 0);
+                    bool can_indent = (fmt != cv::FileStorage::Mode::FORMAT_JSON);
+                    base64_writer = new base64::Base64Writer(*this, can_indent);
+                    if (!can_indent) {
+                        char *ptr = bufferPtr();
+                        *ptr++ = '\0';
+                        puts(bufferStart());
+                        setBufferPtr(bufferStart());
+                        memset(bufferStart(), 0, static_cast<int>(space));
+                        puts("\"$base64$");
+                    }
+                    break;
+                }
+                case FileStorage_API::Base64State::Uncertain:
+                    break;
+                case FileStorage_API::Base64State::NotUse:
+                    break;
+                default:
+                    CV_Error(cv::Error::StsError, err_unkonwn_state);
+                    break;
+            }
+            break;
+        case FileStorage_API::Base64State::InUse:
+            switch (new_state) {
+                case FileStorage_API::Base64State::InUse:
+                case FileStorage_API::Base64State::NotUse:
+                    CV_Error(cv::Error::StsError, err_unable_to_switch);
+                    break;
+                case FileStorage_API::Base64State::Uncertain:
+                    delete base64_writer;
+                    base64_writer = 0;
+                    if ( fmt == cv::FileStorage::FORMAT_JSON )
+                    {
+                        puts("\"");
+                        setBufferPtr(bufferStart());
+                        flush();
+                        memset(bufferStart(), 0, static_cast<int>(space) );
+                        setBufferPtr(bufferStart());
+                    }
+                    break;
+                default:
+                    CV_Error(cv::Error::StsError, err_unkonwn_state);
+                    break;
+            }
+            break;
+        case FileStorage_API::Base64State::NotUse:
+            switch (new_state) {
+                case FileStorage_API::Base64State::InUse:
+                case FileStorage_API::Base64State::NotUse:
+                    CV_Error(cv::Error::StsError, err_unable_to_switch);
+                    break;
+                case FileStorage_API::Base64State::Uncertain:
+                    break;
+                default:
+                    CV_Error(cv::Error::StsError, err_unkonwn_state);
+                    break;
+            }
+            break;
+        default:
+            CV_Error(cv::Error::StsError, err_unkonwn_state);
+            break;
     }
 
-    FileNode root(int streamIdx=0) const
-    {
-        return streamIdx >= 0 && streamIdx < (int)roots.size() ? roots[streamIdx] : FileNode();
-    }
+    state_of_writing_base64 = new_state;
+}
 
-    FileNode operator[](const String& nodename) const
-    {
-        return this->operator[](nodename.c_str());
+void FileStorage::Impl::make_write_struct_delayed(const char *key, int struct_flags, const char *type_name) {
+    CV_Assert(is_write_struct_delayed == false);
+    CV_DbgAssert(delayed_struct_key == nullptr);
+    CV_DbgAssert(delayed_struct_flags == 0);
+    CV_DbgAssert(delayed_type_name == nullptr);
+
+    delayed_struct_flags = struct_flags;
+
+    if (key != nullptr) {
+        delayed_struct_key = new char[strlen(key) + 1U];
+        strcpy(delayed_struct_key, key);
     }
 
-    FileNode operator[](const char* /*nodename*/) const
-    {
-        return FileNode();
+    if (type_name != nullptr) {
+        delayed_type_name = new char[strlen(type_name) + 1U];
+        strcpy(delayed_type_name, type_name);
     }
 
-    int getFormat() const { return fmt; }
+    is_write_struct_delayed = true;
+}
 
-    char* bufferPtr() const { return (char*)(&buffer[0] + bufofs); }
-    char* bufferStart() const { return (char*)&buffer[0]; }
-    char* bufferEnd() const { return (char*)(&buffer[0] + buffer.size()); }
-    void setBufferPtr(char* ptr)
-    {
-        char* bufferstart = bufferStart();
-        CV_Assert( ptr >= bufferstart && ptr <= bufferEnd() );
-        bufofs = ptr - bufferstart;
-    }
-    int wrapMargin() const { return wrap_margin; }
+void FileStorage::Impl::check_if_write_struct_is_delayed(bool change_type_to_base64) {
+    if (is_write_struct_delayed) {
+        /* save data to prevent recursive call errors */
+        std::string struct_key;
+        std::string type_name;
+        int struct_flags = delayed_struct_flags;
 
-    FStructData& getCurrentStruct()
-    {
-        CV_Assert(!write_stack.empty());
-        return write_stack.back();
+        if (delayed_struct_key != nullptr && *delayed_struct_key != '\0') {
+            struct_key.assign(delayed_struct_key);
+        }
+        if (delayed_type_name != nullptr && *delayed_type_name != '\0') {
+            type_name.assign(delayed_type_name);
+        }
+
+        /* reset */
+        delete[] delayed_struct_key;
+        delete[] delayed_type_name;
+        delayed_struct_key = nullptr;
+        delayed_struct_flags = 0;
+        delayed_type_name = nullptr;
+
+        is_write_struct_delayed = false;
+
+        /* call */
+        if (change_type_to_base64) {
+            startWriteStruct_helper(struct_key.c_str(), struct_flags, "binary");
+            if (state_of_writing_base64 != FileStorage_API::Uncertain)
+                switch_to_Base64_state(FileStorage_API::Uncertain);
+            switch_to_Base64_state(FileStorage_API::InUse);
+        } else {
+            startWriteStruct_helper(struct_key.c_str(), struct_flags, type_name.c_str());
+            if (state_of_writing_base64 != FileStorage_API::Uncertain)
+                switch_to_Base64_state(FileStorage_API::Uncertain);
+            switch_to_Base64_state(FileStorage_API::NotUse);
+        }
     }
+}
 
-    void setNonEmpty()
-    {
-        empty_stream = false;
+void FileStorage::Impl::writeRawDataBase64(const void *_data, size_t len, const char *dt) {
+    CV_Assert(write_mode);
+
+    check_if_write_struct_is_delayed(true);
+
+    if (state_of_writing_base64 == FileStorage_API::Base64State::Uncertain) {
+        switch_to_Base64_state(FileStorage_API::Base64State::InUse);
+    } else if (state_of_writing_base64 != FileStorage_API::Base64State::InUse) {
+        CV_Error(cv::Error::StsError, "Base64 should not be used at present.");
     }
 
-    void processSpecialDouble( char* buf, double* value, char** endptr )
-    {
-        FileStorage_API* fs = this;
-        char c = buf[0];
-        int inf_hi = 0x7ff00000;
+    base64_writer->write(_data, len, dt);
+}
 
-        if( c == '-' || c == '+' )
-        {
-            inf_hi = c == '-' ? 0xfff00000 : 0x7ff00000;
-            c = *++buf;
-        }
+FileNode FileStorage::Impl::getFirstTopLevelNode() const {
+    return roots.empty() ? FileNode() : roots[0];
+}
 
-        if( c != '.' )
-            CV_PARSE_ERROR_CPP( "Bad format of floating-point constant" );
+FileNode FileStorage::Impl::root(int streamIdx) const {
+    return streamIdx >= 0 && streamIdx < (int) roots.size() ? roots[streamIdx] : FileNode();
+}
 
-        Cv64suf v;
-        v.f = 0.;
-        if( toupper(buf[1]) == 'I' && toupper(buf[2]) == 'N' && toupper(buf[3]) == 'F' )
-            v.u = (uint64)inf_hi << 32;
-        else if( toupper(buf[1]) == 'N' && toupper(buf[2]) == 'A' && toupper(buf[3]) == 'N' )
-            v.u = (uint64)-1;
-        else
-            CV_PARSE_ERROR_CPP( "Bad format of floating-point constant" );
-        *value = v.f;
-        *endptr = buf + 4;
-    }
+FileNode FileStorage::Impl::operator[](const String &nodename) const {
+    return this->operator[](nodename.c_str());
+}
 
-    double strtod( char* ptr, char** endptr )
-    {
-        double fval = ::strtod( ptr, endptr );
-        if( **endptr == '.' )
-        {
-            char* dot_pos = *endptr;
-            *dot_pos = ',';
-            double fval2 = ::strtod( ptr, endptr );
-            *dot_pos = '.';
-            if( *endptr > dot_pos )
-                fval = fval2;
-            else
-                *endptr = dot_pos;
-        }
+FileNode FileStorage::Impl::operator[](const char * /*nodename*/) const {
+    return FileNode();
+}
 
-        if( *endptr == ptr || cv_isalpha(**endptr) )
-            processSpecialDouble( ptr, &fval, endptr );
+int FileStorage::Impl::getFormat() const { return fmt; }
 
-        return fval;
-    }
+char *FileStorage::Impl::bufferPtr() const { return (char *) (&buffer[0] + bufofs); }
 
-    void convertToCollection(int type, FileNode& node)
-    {
-        CV_Assert( type == FileNode::SEQ || type == FileNode::MAP );
+char *FileStorage::Impl::bufferStart() const { return (char *) &buffer[0]; }
 
-        int node_type = node.type();
-        if( node_type == type )
-            return;
+char *FileStorage::Impl::bufferEnd() const { return (char *) (&buffer[0] + buffer.size()); }
 
-        bool named = node.isNamed();
-        uchar* ptr = node.ptr() + 1 + (named ? 4 : 0);
+void FileStorage::Impl::setBufferPtr(char *ptr) {
+    char *bufferstart = bufferStart();
+    CV_Assert(ptr >= bufferstart && ptr <= bufferEnd());
+    bufofs = ptr - bufferstart;
+}
 
-        int ival = 0;
-        double fval = 0;
-        std::string sval;
-        bool add_first_scalar = false;
+int FileStorage::Impl::wrapMargin() const { return wrap_margin; }
 
-        if( node_type != FileNode::NONE )
-        {
-            // scalar nodes can only be converted to sequences, e.g. in XML:
-            // <a>5[parser_position]... => create 5 with name "a"
-            // <a>5 6[parser_position]... => 5 is converted to [5] and then 6 is added to it
-            //
-            // otherwise we don't know where to get the element names from
-            CV_Assert( type == FileNode::SEQ );
-            if( node_type == FileNode::INT )
-            {
-                ival = readInt(ptr);
-                add_first_scalar = true;
-            }
-            else if( node_type == FileNode::REAL )
-            {
-                fval = readReal(ptr);
-                add_first_scalar = true;
-            }
-            else if( node_type == FileNode::STRING )
-            {
-                sval = std::string(node);
-                add_first_scalar = true;
-            }
-            else
-                CV_Error_(Error::StsError, ("The node of type %d cannot be converted to collection", node_type));
-        }
+FStructData &FileStorage::Impl::getCurrentStruct() {
+    CV_Assert(!write_stack.empty());
+    return write_stack.back();
+}
 
-        ptr = reserveNodeSpace(node, 1 + (named ? 4 : 0) + 4 + 4);
-        *ptr++ = (uchar)(type | (named ? FileNode::NAMED : 0));
-        // name has been copied automatically
-        if( named )
-            ptr += 4;
-        // set raw_size(collection)==4, nelems(collection)==1
-        writeInt(ptr, 4);
-        writeInt(ptr + 4, 0);
-
-        if( add_first_scalar )
-            addNode(node, std::string(), node_type,
-                    node_type == FileNode::INT ? (const void*)&ival :
-                    node_type == FileNode::REAL ? (const void*)&fval :
-                    node_type == FileNode::STRING ? (const void*)sval.c_str() : 0,
-                    -1);
-    }
-
-    // a) allocates new FileNode (for that just set blockIdx to the last block and ofs to freeSpaceOfs) or
-    // b) reallocates just created new node (blockIdx and ofs must be taken from FileNode).
-    //    If there is no enough space in the current block (it should be the last block added so far),
-    //    the last block is shrunk so that it ends immediately before the reallocated node. Then,
-    //    a new block of sufficient size is allocated and the FileNode is placed in the beginning of it.
-    // The case (a) can be used to allocate the very first node by setting blockIdx == ofs == 0.
-    // In the case (b) the existing tag and the name are copied automatically.
-    uchar* reserveNodeSpace(FileNode& node, size_t sz)
-    {
-        bool shrinkBlock = false;
-        size_t shrinkBlockIdx = 0, shrinkSize = 0;
+void FileStorage::Impl::setNonEmpty() {
+    empty_stream = false;
+}
 
-        uchar *ptr = 0, *blockEnd = 0;
+void FileStorage::Impl::processSpecialDouble(char *buf, double *value, char **endptr) {
+    FileStorage_API *fs = this;
+    char c = buf[0];
+    int inf_hi = 0x7ff00000;
 
-        if( !fs_data_ptrs.empty() )
-        {
-            size_t blockIdx = node.blockIdx;
-            size_t ofs = node.ofs;
-            CV_Assert( blockIdx == fs_data_ptrs.size()-1 );
-            CV_Assert( ofs <= fs_data_blksz[blockIdx] );
-            CV_Assert( freeSpaceOfs <= fs_data_blksz[blockIdx] );
-            //CV_Assert( freeSpaceOfs <= ofs + sz );
-
-            ptr = fs_data_ptrs[blockIdx] + ofs;
-            blockEnd = fs_data_ptrs[blockIdx] + fs_data_blksz[blockIdx];
-
-            CV_Assert(ptr >= fs_data_ptrs[blockIdx] && ptr <= blockEnd);
-            if( ptr + sz <= blockEnd )
-            {
-                freeSpaceOfs = ofs + sz;
-                return ptr;
-            }
+    if (c == '-' || c == '+') {
+        inf_hi = c == '-' ? 0xfff00000 : 0x7ff00000;
+        c = *++buf;
+    }
 
-            if (ofs == 0)  // FileNode is a first component of this block. Resize current block instead of allocation of new one.
-            {
-                fs_data[blockIdx]->resize(sz);
-                ptr = &fs_data[blockIdx]->at(0);
-                fs_data_ptrs[blockIdx] = ptr;
-                fs_data_blksz[blockIdx] = sz;
-                freeSpaceOfs = sz;
-                return ptr;
-            }
+    if (c != '.')
+        CV_PARSE_ERROR_CPP("Bad format of floating-point constant");
 
-            shrinkBlock = true;
-            shrinkBlockIdx = blockIdx;
-            shrinkSize = ofs;
+    Cv64suf v;
+    v.f = 0.;
+    if (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'N' && toupper(buf[3]) == 'F')
+        v.u = (uint64) inf_hi << 32;
+    else if (toupper(buf[1]) == 'N' && toupper(buf[2]) == 'A' && toupper(buf[3]) == 'N')
+        v.u = (uint64) -1;
+    else
+        CV_PARSE_ERROR_CPP("Bad format of floating-point constant");
+    *value = v.f;
+    *endptr = buf + 4;
+}
+
+double FileStorage::Impl::strtod(char *ptr, char **endptr) {
+    double fval = ::strtod(ptr, endptr);
+    if (**endptr == '.') {
+        char *dot_pos = *endptr;
+        *dot_pos = ',';
+        double fval2 = ::strtod(ptr, endptr);
+        *dot_pos = '.';
+        if (*endptr > dot_pos)
+            fval = fval2;
+        else
+            *endptr = dot_pos;
+    }
+
+    if (*endptr == ptr || cv_isalpha(**endptr))
+        processSpecialDouble(ptr, &fval, endptr);
+
+    return fval;
+}
+
+void FileStorage::Impl::convertToCollection(int type, FileNode &node) {
+    CV_Assert(type == FileNode::SEQ || type == FileNode::MAP);
+
+    int node_type = node.type();
+    if (node_type == type)
+        return;
+
+    bool named = node.isNamed();
+    uchar *ptr = node.ptr() + 1 + (named ? 4 : 0);
+
+    int ival = 0;
+    double fval = 0;
+    std::string sval;
+    bool add_first_scalar = false;
+
+    if (node_type != FileNode::NONE) {
+        // scalar nodes can only be converted to sequences, e.g. in XML:
+        // <a>5[parser_position]... => create 5 with name "a"
+        // <a>5 6[parser_position]... => 5 is converted to [5] and then 6 is added to it
+        //
+        // otherwise we don't know where to get the element names from
+        CV_Assert(type == FileNode::SEQ);
+        if (node_type == FileNode::INT) {
+            ival = readInt(ptr);
+            add_first_scalar = true;
+        } else if (node_type == FileNode::REAL) {
+            fval = readReal(ptr);
+            add_first_scalar = true;
+        } else if (node_type == FileNode::STRING) {
+            sval = std::string(node);
+            add_first_scalar = true;
+        } else
+            CV_Error_(Error::StsError, ("The node of type %d cannot be converted to collection", node_type));
+    }
+
+    ptr = reserveNodeSpace(node, 1 + (named ? 4 : 0) + 4 + 4);
+    *ptr++ = (uchar) (type | (named ? FileNode::NAMED : 0));
+    // name has been copied automatically
+    if (named)
+        ptr += 4;
+    // set raw_size(collection)==4, nelems(collection)==1
+    writeInt(ptr, 4);
+    writeInt(ptr + 4, 0);
+
+    if (add_first_scalar)
+        addNode(node, std::string(), node_type,
+                node_type == FileNode::INT ? (const void *) &ival :
+                node_type == FileNode::REAL ? (const void *) &fval :
+                node_type == FileNode::STRING ? (const void *) sval.c_str() : 0,
+                -1);
+}
+
+// a) allocates new FileNode (for that just set blockIdx to the last block and ofs to freeSpaceOfs) or
+// b) reallocates just created new node (blockIdx and ofs must be taken from FileNode).
+//    If there is no enough space in the current block (it should be the last block added so far),
+//    the last block is shrunk so that it ends immediately before the reallocated node. Then,
+//    a new block of sufficient size is allocated and the FileNode is placed in the beginning of it.
+// The case (a) can be used to allocate the very first node by setting blockIdx == ofs == 0.
+// In the case (b) the existing tag and the name are copied automatically.
+uchar *FileStorage::Impl::reserveNodeSpace(FileNode &node, size_t sz) {
+    bool shrinkBlock = false;
+    size_t shrinkBlockIdx = 0, shrinkSize = 0;
+
+    uchar *ptr = 0, *blockEnd = 0;
+
+    if (!fs_data_ptrs.empty()) {
+        size_t blockIdx = node.blockIdx;
+        size_t ofs = node.ofs;
+        CV_Assert(blockIdx == fs_data_ptrs.size() - 1);
+        CV_Assert(ofs <= fs_data_blksz[blockIdx]);
+        CV_Assert(freeSpaceOfs <= fs_data_blksz[blockIdx]);
+        //CV_Assert( freeSpaceOfs <= ofs + sz );
+
+        ptr = fs_data_ptrs[blockIdx] + ofs;
+        blockEnd = fs_data_ptrs[blockIdx] + fs_data_blksz[blockIdx];
+
+        CV_Assert(ptr >= fs_data_ptrs[blockIdx] && ptr <= blockEnd);
+        if (ptr + sz <= blockEnd) {
+            freeSpaceOfs = ofs + sz;
+            return ptr;
         }
 
-        size_t blockSize = std::max((size_t)CV_FS_MAX_LEN*4 - 256, sz) + 256;
-        Ptr<std::vector<uchar> > pv = makePtr<std::vector<uchar> >(blockSize);
-        fs_data.push_back(pv);
-        uchar* new_ptr = &pv->at(0);
-        fs_data_ptrs.push_back(new_ptr);
-        fs_data_blksz.push_back(blockSize);
-        node.blockIdx = fs_data_ptrs.size()-1;
-        node.ofs = 0;
-        freeSpaceOfs = sz;
-
-        if( ptr && ptr + 5 <= blockEnd )
+        if (ofs ==
+            0)  // FileNode is a first component of this block. Resize current block instead of allocation of new one.
         {
-            new_ptr[0] = ptr[0];
-            if( ptr[0] & FileNode::NAMED )
-            {
-                new_ptr[1] = ptr[1];
-                new_ptr[2] = ptr[2];
-                new_ptr[3] = ptr[3];
-                new_ptr[4] = ptr[4];
-            }
+            fs_data[blockIdx]->resize(sz);
+            ptr = &fs_data[blockIdx]->at(0);
+            fs_data_ptrs[blockIdx] = ptr;
+            fs_data_blksz[blockIdx] = sz;
+            freeSpaceOfs = sz;
+            return ptr;
         }
 
-        if (shrinkBlock)
-        {
-            fs_data[shrinkBlockIdx]->resize(shrinkSize);
-            fs_data_blksz[shrinkBlockIdx] = shrinkSize;
+        shrinkBlock = true;
+        shrinkBlockIdx = blockIdx;
+        shrinkSize = ofs;
+    }
+
+    size_t blockSize = std::max((size_t) CV_FS_MAX_LEN * 4 - 256, sz) + 256;
+    Ptr<std::vector<uchar> > pv = makePtr<std::vector<uchar> >(blockSize);
+    fs_data.push_back(pv);
+    uchar *new_ptr = &pv->at(0);
+    fs_data_ptrs.push_back(new_ptr);
+    fs_data_blksz.push_back(blockSize);
+    node.blockIdx = fs_data_ptrs.size() - 1;
+    node.ofs = 0;
+    freeSpaceOfs = sz;
+
+    if (ptr && ptr + 5 <= blockEnd) {
+        new_ptr[0] = ptr[0];
+        if (ptr[0] & FileNode::NAMED) {
+            new_ptr[1] = ptr[1];
+            new_ptr[2] = ptr[2];
+            new_ptr[3] = ptr[3];
+            new_ptr[4] = ptr[4];
         }
-
-        return new_ptr;
     }
 
-    unsigned getStringOfs( const std::string& key ) const
-    {
-        str_hash_t::const_iterator it = str_hash.find(key);
-        return it != str_hash.end() ? it->second : 0;
+    if (shrinkBlock) {
+        fs_data[shrinkBlockIdx]->resize(shrinkSize);
+        fs_data_blksz[shrinkBlockIdx] = shrinkSize;
     }
 
-    FileNode addNode( FileNode& collection, const std::string& key,
-                       int elem_type, const void* value, int len )
-    {
-        FileStorage_API* fs = this;
-        bool noname = key.empty() || (fmt == FileStorage::FORMAT_XML && strcmp(key.c_str(), "_") == 0);
-        convertToCollection( noname ? FileNode::SEQ : FileNode::MAP, collection );
-
-        bool isseq = collection.empty() ? false : collection.isSeq();
-        if( noname != isseq )
-            CV_PARSE_ERROR_CPP( noname ? "Map element should have a name" :
-                                "Sequence element should not have name (use <_></_>)" );
-        unsigned strofs = 0;
-        if( !noname )
-        {
-            strofs = getStringOfs(key);
-            if( !strofs )
-            {
-                strofs = (unsigned)str_hash_data.size();
-                size_t keysize = key.size() + 1;
-                str_hash_data.resize(strofs + keysize);
-                memcpy(&str_hash_data[0] + strofs, &key[0], keysize);
-                str_hash.insert(std::make_pair(key, strofs));
-            }
-        }
+    return new_ptr;
+}
 
-        uchar* cp = collection.ptr();
+unsigned FileStorage::Impl::getStringOfs(const std::string &key) const {
+    str_hash_t::const_iterator it = str_hash.find(key);
+    return it != str_hash.end() ? it->second : 0;
+}
 
-        size_t blockIdx = fs_data_ptrs.size() - 1;
-        size_t ofs = freeSpaceOfs;
-        FileNode node(fs_ext, blockIdx, ofs);
+FileNode FileStorage::Impl::addNode(FileNode &collection, const std::string &key,
+                                    int elem_type, const void *value, int len) {
+    FileStorage_API *fs = this;
+    bool noname = key.empty() || (fmt == FileStorage::FORMAT_XML && strcmp(key.c_str(), "_") == 0);
+    convertToCollection(noname ? FileNode::SEQ : FileNode::MAP, collection);
 
-        size_t sz0 = 1 + (noname ? 0 : 4) + 8;
-        uchar* ptr = reserveNodeSpace(node, sz0);
+    bool isseq = collection.empty() ? false : collection.isSeq();
+    if (noname != isseq)
+        CV_PARSE_ERROR_CPP(noname ? "Map element should have a name" :
+                           "Sequence element should not have name (use <_></_>)");
+    unsigned strofs = 0;
+    if (!noname) {
+        strofs = getStringOfs(key);
+        if (!strofs) {
+            strofs = (unsigned) str_hash_data.size();
+            size_t keysize = key.size() + 1;
+            str_hash_data.resize(strofs + keysize);
+            memcpy(&str_hash_data[0] + strofs, &key[0], keysize);
+            str_hash.insert(std::make_pair(key, strofs));
+        }
+    }
 
-        *ptr++ = (uchar)(elem_type | (noname ? 0 : FileNode::NAMED));
-        if( elem_type == FileNode::NONE )
-            freeSpaceOfs -= 8;
+    uchar *cp = collection.ptr();
 
-        if( !noname )
-        {
-            writeInt(ptr, (int)strofs);
-            ptr += 4;
-        }
+    size_t blockIdx = fs_data_ptrs.size() - 1;
+    size_t ofs = freeSpaceOfs;
+    FileNode node(fs_ext, blockIdx, ofs);
 
-        if( elem_type == FileNode::SEQ || elem_type == FileNode::MAP )
-        {
-            writeInt(ptr, 4);
-            writeInt(ptr, 0);
-        }
+    size_t sz0 = 1 + (noname ? 0 : 4) + 8;
+    uchar *ptr = reserveNodeSpace(node, sz0);
 
-        if( value )
-            node.setValue(elem_type, value, len);
+    *ptr++ = (uchar) (elem_type | (noname ? 0 : FileNode::NAMED));
+    if (elem_type == FileNode::NONE)
+        freeSpaceOfs -= 8;
 
-        if( collection.isNamed() )
-            cp += 4;
-        int nelems = readInt(cp + 5);
-        writeInt(cp + 5, nelems + 1);
+    if (!noname) {
+        writeInt(ptr, (int) strofs);
+        ptr += 4;
+    }
 
-        return node;
+    if (elem_type == FileNode::SEQ || elem_type == FileNode::MAP) {
+        writeInt(ptr, 4);
+        writeInt(ptr, 0);
     }
 
-    void finalizeCollection( FileNode& collection )
-    {
-        if( !collection.isSeq() && !collection.isMap() )
-            return;
-        uchar* ptr0 = collection.ptr(), *ptr = ptr0 + 1;
-        if( *ptr0 & FileNode::NAMED )
-            ptr += 4;
-        size_t blockIdx = collection.blockIdx;
-        size_t ofs = collection.ofs + (size_t)(ptr + 8 - ptr0);
-        size_t rawSize = 4;
-        unsigned sz = (unsigned)readInt(ptr + 4);
-        if( sz > 0 )
-        {
-            size_t lastBlockIdx = fs_data_ptrs.size() - 1;
+    if (value)
+        node.setValue(elem_type, value, len);
 
-            for( ; blockIdx < lastBlockIdx; blockIdx++ )
-            {
-                rawSize += fs_data_blksz[blockIdx] - ofs;
-                ofs = 0;
-            }
+    if (collection.isNamed())
+        cp += 4;
+    int nelems = readInt(cp + 5);
+    writeInt(cp + 5, nelems + 1);
+
+    return node;
+}
+
+void FileStorage::Impl::finalizeCollection(FileNode &collection) {
+    if (!collection.isSeq() && !collection.isMap())
+        return;
+    uchar *ptr0 = collection.ptr(), *ptr = ptr0 + 1;
+    if (*ptr0 & FileNode::NAMED)
+        ptr += 4;
+    size_t blockIdx = collection.blockIdx;
+    size_t ofs = collection.ofs + (size_t) (ptr + 8 - ptr0);
+    size_t rawSize = 4;
+    unsigned sz = (unsigned) readInt(ptr + 4);
+    if (sz > 0) {
+        size_t lastBlockIdx = fs_data_ptrs.size() - 1;
+
+        for (; blockIdx < lastBlockIdx; blockIdx++) {
+            rawSize += fs_data_blksz[blockIdx] - ofs;
+            ofs = 0;
         }
-        rawSize += freeSpaceOfs - ofs;
-        writeInt(ptr, (int)rawSize);
     }
+    rawSize += freeSpaceOfs - ofs;
+    writeInt(ptr, (int) rawSize);
+}
 
-    void normalizeNodeOfs(size_t& blockIdx, size_t& ofs) const
-    {
-        while( ofs >= fs_data_blksz[blockIdx] )
-        {
-            if( blockIdx == fs_data_blksz.size() - 1 )
-            {
-                CV_Assert( ofs == fs_data_blksz[blockIdx] );
-                break;
-            }
-            ofs -= fs_data_blksz[blockIdx];
-            blockIdx++;
+void FileStorage::Impl::normalizeNodeOfs(size_t &blockIdx, size_t &ofs) const {
+    while (ofs >= fs_data_blksz[blockIdx]) {
+        if (blockIdx == fs_data_blksz.size() - 1) {
+            CV_Assert(ofs == fs_data_blksz[blockIdx]);
+            break;
         }
+        ofs -= fs_data_blksz[blockIdx];
+        blockIdx++;
     }
+}
 
-    class Base64Decoder
-    {
-    public:
-        Base64Decoder() { ofs = 0; ptr = 0; indent = 0; totalchars = 0; eos = true; }
-        void init(Ptr<FileStorageParser>& _parser, char* _ptr, int _indent)
-        {
-            parser = _parser;
-            ptr = _ptr;
-            indent = _indent;
-            encoded.clear();
-            decoded.clear();
-            ofs = 0;
-            totalchars = 0;
-            eos = false;
-        }
+FileStorage::Impl::Base64State FileStorage::Impl::get_state_of_writing_base64() {
+    return state_of_writing_base64;
+}
 
-        bool readMore(int needed)
-        {
-            static const uchar base64tab[] =
+int FileStorage::Impl::get_space() {
+    return space;
+}
+
+
+FileStorage::Impl::Base64Decoder::Base64Decoder() {
+    ofs = 0;
+    ptr = 0;
+    indent = 0;
+    totalchars = 0;
+    eos = true;
+}
+
+void FileStorage::Impl::Base64Decoder::init(Ptr<FileStorageParser> &_parser, char *_ptr, int _indent) {
+    parser = _parser;
+    ptr = _ptr;
+    indent = _indent;
+    encoded.clear();
+    decoded.clear();
+    ofs = 0;
+    totalchars = 0;
+    eos = false;
+}
+
+bool FileStorage::Impl::Base64Decoder::readMore(int needed) {
+    static const uchar base64tab[] =
             {
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 62,  0,  0,  0, 63,
-               52, 53, 54, 55, 56, 57, 58, 59, 60, 61,  0,  0,  0,  0,  0,  0,
-                0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  0,  0,  0,  0,  0,
-                0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
-               41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
+                    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+                    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
+                    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+                    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
             };
 
-            if( eos )
-                return false;
-
-            size_t sz = decoded.size();
-            CV_Assert( ofs <= sz );
-            sz -= ofs;
-            for( size_t i = 0; i < sz; i++ )
-                decoded[i] = decoded[ofs + i];
+    if (eos)
+        return false;
 
-            decoded.resize(sz);
-            ofs = 0;
+    size_t sz = decoded.size();
+    CV_Assert(ofs <= sz);
+    sz -= ofs;
+    for (size_t i = 0; i < sz; i++)
+        decoded[i] = decoded[ofs + i];
 
-            CV_Assert( !parser.empty() && ptr );
-            char *beg = 0, *end = 0;
-            bool ok = parser->getBase64Row(ptr, indent, beg, end);
-            ptr = end;
-            std::copy(beg, end, std::back_inserter(encoded));
-            totalchars += end - beg;
+    decoded.resize(sz);
+    ofs = 0;
 
-            if( !ok || beg == end )
-            {
-                // in the end of base64 sequence pad it with '=' characters so that
-                // its total length is multiple of
-                eos = true;
-                size_t tc = totalchars;
-                for( ; tc % 4 != 0; tc++ )
-                    encoded.push_back('=');
-            }
+    CV_Assert(!parser.empty() && ptr);
+    char *beg = 0, *end = 0;
+    bool ok = parser->getBase64Row(ptr, indent, beg, end);
+    ptr = end;
+    std::copy(beg, end, std::back_inserter(encoded));
+    totalchars += end - beg;
+
+    if (!ok || beg == end) {
+        // in the end of base64 sequence pad it with '=' characters so that
+        // its total length is multiple of
+        eos = true;
+        size_t tc = totalchars;
+        for (; tc % 4 != 0; tc++)
+            encoded.push_back('=');
+    }
+
+    int i = 0, j, n = (int) encoded.size();
+    if (n > 0) {
+        const uchar *tab = base64tab;
+        char *src = &encoded[0];
+
+        for (; i <= n - 4; i += 4) {
+            // dddddd cccccc bbbbbb aaaaaa => ddddddcc ccccbbbb bbaaaaaa
+            uchar d = tab[(int) (uchar) src[i]], c = tab[(int) (uchar) src[i + 1]];
+            uchar b = tab[(int) (uchar) src[i + 2]], a = tab[(int) (uchar) src[i + 3]];
+
+            decoded.push_back((uchar) ((d << 2) | (c >> 4)));
+            decoded.push_back((uchar) ((c << 4) | (b >> 2)));
+            decoded.push_back((uchar) ((b << 6) | a));
+        }
+    }
 
-            int i = 0, j, n = (int)encoded.size();
-            if( n > 0 )
-            {
-                const uchar* tab = base64tab;
-                char* src = &encoded[0];
+    if (i > 0 && encoded[i - 1] == '=') {
+        if (i > 1 && encoded[i - 2] == '=' && !decoded.empty())
+            decoded.pop_back();
+        if (!decoded.empty())
+            decoded.pop_back();
+    }
 
-                for( ; i <= n - 4; i += 4 )
-                {
-                    // dddddd cccccc bbbbbb aaaaaa => ddddddcc ccccbbbb bbaaaaaa
-                    uchar d = tab[(int)(uchar)src[i]], c = tab[(int)(uchar)src[i+1]];
-                    uchar b = tab[(int)(uchar)src[i+2]], a = tab[(int)(uchar)src[i+3]];
+    n -= i;
+    for (j = 0; j < n; j++)
+        encoded[j] = encoded[i + j];
+    encoded.resize(n);
 
-                    decoded.push_back((uchar)((d << 2) | (c >> 4)));
-                    decoded.push_back((uchar)((c << 4) | (b >> 2)));
-                    decoded.push_back((uchar)((b << 6) | a));
-                }
-            }
+    return (int) decoded.size() >= needed;
+}
 
-            if( i > 0 && encoded[i-1] == '=' )
-            {
-                if( i > 1 && encoded[i-2] == '=' && !decoded.empty() )
-                    decoded.pop_back();
-                if( !decoded.empty() )
-                    decoded.pop_back();
-            }
+uchar FileStorage::Impl::Base64Decoder::getUInt8() {
+    size_t sz = decoded.size();
+    if (ofs >= sz && !readMore(1))
+        return (uchar) 0;
+    return decoded[ofs++];
+}
 
-            n -= i;
-            for( j = 0; j < n; j++ )
-                encoded[j] = encoded[i + j];
-            encoded.resize(n);
+ushort FileStorage::Impl::Base64Decoder::getUInt16() {
+    size_t sz = decoded.size();
+    if (ofs + 2 > sz && !readMore(2))
+        return (ushort) 0;
+    ushort val = (decoded[ofs] + (decoded[ofs + 1] << 8));
+    ofs += 2;
+    return val;
+}
 
-            return (int)decoded.size() >= needed;
-        }
+int FileStorage::Impl::Base64Decoder::getInt32() {
+    size_t sz = decoded.size();
+    if (ofs + 4 > sz && !readMore(4))
+        return 0;
+    int ival = readInt(&decoded[ofs]);
+    ofs += 4;
+    return ival;
+}
 
-        uchar getUInt8()
-        {
-            size_t sz = decoded.size();
-            if( ofs >= sz && !readMore(1) )
-                return (uchar)0;
-            return decoded[ofs++];
-        }
+double FileStorage::Impl::Base64Decoder::getFloat64() {
+    size_t sz = decoded.size();
+    if (ofs + 8 > sz && !readMore(8))
+        return 0;
+    double fval = readReal(&decoded[ofs]);
+    ofs += 8;
+    return fval;
+}
 
-        ushort getUInt16()
-        {
-            size_t sz = decoded.size();
-            if( ofs + 2 > sz && !readMore(2) )
-                return (ushort)0;
-            ushort val = (decoded[ofs] + (decoded[ofs + 1] << 8));
-            ofs += 2;
-            return val;
-        }
+bool FileStorage::Impl::Base64Decoder::endOfStream() const { return eos; }
 
-        int getInt32()
-        {
-            size_t sz = decoded.size();
-            if( ofs + 4 > sz && !readMore(4) )
-                return 0;
-            int ival = readInt(&decoded[ofs]);
-            ofs += 4;
-            return ival;
-        }
+char *FileStorage::Impl::Base64Decoder::getPtr() const { return ptr; }
 
-        double getFloat64()
-        {
-            size_t sz = decoded.size();
-            if( ofs + 8 > sz && !readMore(8) )
-                return 0;
-            double fval = readReal(&decoded[ofs]);
-            ofs += 8;
-            return fval;
-        }
 
-        bool endOfStream() const { return eos; }
-        char* getPtr() const { return ptr; }
-    protected:
-
-        Ptr<FileStorageParser> parser;
-        char* ptr;
-        int indent;
-        std::vector<char> encoded;
-        std::vector<uchar> decoded;
-        size_t ofs;
-        size_t totalchars;
-        bool eos;
-    };
-
-    char* parseBase64(char* ptr, int indent, FileNode& collection)
-    {
-        const int BASE64_HDR_SIZE = 24;
-        char dt[BASE64_HDR_SIZE+1] = {0};
-        base64decoder.init(parser, ptr, indent);
+char *FileStorage::Impl::parseBase64(char *ptr, int indent, FileNode &collection) {
+    const int BASE64_HDR_SIZE = 24;
+    char dt[BASE64_HDR_SIZE + 1] = {0};
+    base64decoder.init(parser, ptr, indent);
 
-        int i, k;
+    int i, k;
 
-        for( i = 0; i < BASE64_HDR_SIZE; i++ )
-            dt[i] = (char)base64decoder.getUInt8();
-        for( i = 0; i < BASE64_HDR_SIZE; i++ )
-            if( isspace(dt[i]))
-                break;
-        dt[i] = '\0';
+    for (i = 0; i < BASE64_HDR_SIZE; i++)
+        dt[i] = (char) base64decoder.getUInt8();
+    for (i = 0; i < BASE64_HDR_SIZE; i++)
+        if (isspace(dt[i]))
+            break;
+    dt[i] = '\0';
 
-        CV_Assert( !base64decoder.endOfStream() );
+    CV_Assert(!base64decoder.endOfStream());
 
-        int fmt_pairs[CV_FS_MAX_FMT_PAIRS*2];
-        int fmt_pair_count = fs::decodeFormat( dt, fmt_pairs, CV_FS_MAX_FMT_PAIRS );
-        int ival = 0;
-        double fval = 0;
+    int fmt_pairs[CV_FS_MAX_FMT_PAIRS * 2];
+    int fmt_pair_count = fs::decodeFormat(dt, fmt_pairs, CV_FS_MAX_FMT_PAIRS);
+    int ival = 0;
+    double fval = 0;
 
-        for(;;)
-        {
-            for( k = 0; k < fmt_pair_count; k++ )
-            {
-                int elem_type = fmt_pairs[k*2+1];
-                int count = fmt_pairs[k*2];
+    for (;;) {
+        for (k = 0; k < fmt_pair_count; k++) {
+            int elem_type = fmt_pairs[k * 2 + 1];
+            int count = fmt_pairs[k * 2];
 
-                for( i = 0; i < count; i++ )
-                {
-                    int node_type = FileNode::INT;
-                    switch( elem_type )
-                    {
+            for (i = 0; i < count; i++) {
+                int node_type = FileNode::INT;
+                switch (elem_type) {
                     case CV_8U:
                         ival = base64decoder.getUInt8();
                         break;
                     case CV_8S:
-                        ival = (char)base64decoder.getUInt8();
+                        ival = (char) base64decoder.getUInt8();
                         break;
                     case CV_16U:
                         ival = base64decoder.getUInt16();
                         break;
                     case CV_16S:
-                        ival = (short)base64decoder.getUInt16();
+                        ival = (short) base64decoder.getUInt16();
                         break;
                     case CV_32S:
                         ival = base64decoder.getInt32();
                         break;
-                    case CV_32F:
-                        {
+                    case CV_32F: {
                         Cv32suf v;
                         v.i = base64decoder.getInt32();
                         fval = v.f;
                         node_type = FileNode::REAL;
-                        }
+                    }
                         break;
                     case CV_64F:
                         fval = base64decoder.getFloat64();
                         node_type = FileNode::REAL;
                         break;
                     case CV_16F:
-                        fval = (float)float16_t::fromBits(base64decoder.getUInt16());
+                        fval = (float) float16_t::fromBits(base64decoder.getUInt16());
                         node_type = FileNode::REAL;
                         break;
                     default:
-                        CV_Error( Error::StsUnsupportedFormat, "Unsupported type" );
-                    }
-
-                    if( base64decoder.endOfStream() )
-                        break;
-                    addNode(collection, std::string(), node_type,
-                            node_type == FileNode::INT ? (void*)&ival : (void*)&fval, -1);
+                        CV_Error(Error::StsUnsupportedFormat, "Unsupported type");
                 }
+
+                if (base64decoder.endOfStream())
+                    break;
+                addNode(collection, std::string(), node_type,
+                        node_type == FileNode::INT ? (void *) &ival : (void *) &fval, -1);
             }
-            if( base64decoder.endOfStream() )
-                break;
         }
-
-        finalizeCollection(collection);
-        return base64decoder.getPtr();
-    }
-
-    void parseError( const char* func_name, const std::string& err_msg, const char* source_file, int source_line )
-    {
-        std::string msg = format("%s(%d): %s", filename.c_str(), lineno, err_msg.c_str());
-        error(Error::StsParseError, func_name, msg.c_str(), source_file, source_line );
-    }
-
-    const uchar* getNodePtr(size_t blockIdx, size_t ofs) const
-    {
-        CV_Assert( blockIdx < fs_data_ptrs.size());
-        CV_Assert( ofs < fs_data_blksz[blockIdx]);
-
-        return fs_data_ptrs[blockIdx] + ofs;
-    }
-
-    std::string getName( size_t nameofs ) const
-    {
-        CV_Assert( nameofs < str_hash_data.size() );
-        return std::string(&str_hash_data[nameofs]);
+        if (base64decoder.endOfStream())
+            break;
     }
 
-    FileStorage* getFS() { return fs_ext; }
-
-    FileStorage* fs_ext;
-
-    std::string filename;
-    int flags;
-    bool empty_stream;
-
-    FILE* file;
-    gzFile gzfile;
+    finalizeCollection(collection);
+    return base64decoder.getPtr();
+}
 
-    bool is_opened;
-    bool dummy_eof;
-    bool write_mode;
-    bool mem_mode;
-    int fmt;
+void FileStorage::Impl::parseError(const char *func_name, const std::string &err_msg, const char *source_file,
+                                   int source_line) {
+    std::string msg = format("%s(%d): %s", filename.c_str(), lineno, err_msg.c_str());
+    error(Error::StsParseError, func_name, msg.c_str(), source_file, source_line);
+}
 
-    State state; //!< current state of the FileStorage (used only for writing)
-    int space, wrap_margin;
-    std::deque<FStructData> write_stack;
-    std::vector<char> buffer;
-    size_t bufofs;
+const uchar *FileStorage::Impl::getNodePtr(size_t blockIdx, size_t ofs) const {
+    CV_Assert(blockIdx < fs_data_ptrs.size());
+    CV_Assert(ofs < fs_data_blksz[blockIdx]);
 
-    std::deque<char> outbuf;
+    return fs_data_ptrs[blockIdx] + ofs;
+}
 
-    Ptr<FileStorageEmitter> emitter;
-    Ptr<FileStorageParser> parser;
-    Base64Decoder base64decoder;
+std::string FileStorage::Impl::getName(size_t nameofs) const {
+    CV_Assert(nameofs < str_hash_data.size());
+    return std::string(&str_hash_data[nameofs]);
+}
 
-    std::vector<FileNode> roots;
-    std::vector<Ptr<std::vector<uchar> > > fs_data;
-    std::vector<uchar*> fs_data_ptrs;
-    std::vector<size_t> fs_data_blksz;
-    size_t freeSpaceOfs;
-    typedef std::unordered_map<std::string, unsigned> str_hash_t;
-    str_hash_t str_hash;
-    std::vector<char> str_hash_data;
+FileStorage *FileStorage::Impl::getFS() { return fs_ext; }
 
-    std::vector<char> strbufv;
-    char* strbuf;
-    size_t strbufsize;
-    size_t strbufpos;
-    int lineno;
-};
 
 FileStorage::FileStorage()
     : state(0)
@@ -1800,7 +1842,7 @@ FileStorage::FileStorage(const String& filename, int flags, const String& encodi
 
 void FileStorage::startWriteStruct(const String& name, int struct_flags, const String& typeName)
 {
-    p->startWriteStruct(name.c_str(), struct_flags, typeName.c_str());
+    p->startWriteStruct(name.size() ? name.c_str() : 0, struct_flags, typeName.size() ? typeName.c_str() : 0);
     elname = String();
     if ((struct_flags & FileNode::TYPE_MASK) == FileNode::SEQ)
         state = FileStorage::VALUE_EXPECTED;
@@ -1875,7 +1917,7 @@ std::string FileStorage::getDefaultObjectName(const std::string& _filename)
     }
     ptr++;
     if( ptr == ptr2 )
-        CV_Error( CV_StsBadArg, "Invalid filename" );
+        CV_Error( cv::Error::StsBadArg, "Invalid filename" );
 
     char* name = name_buf.data();
 
@@ -1998,12 +2040,14 @@ FileStorage& operator << (FileStorage& fs, const String& str)
     if( c == '}' || c == ']' )
     {
         if( fs_impl->write_stack.empty() )
-            CV_Error_( CV_StsError, ("Extra closing '%c'", *_str) );
+            CV_Error_( cv::Error::StsError, ("Extra closing '%c'", *_str) );
+
+        fs_impl->workaround();
 
         int struct_flags = fs_impl->write_stack.back().flags;
         char expected_bracket = FileNode::isMap(struct_flags) ? '}' : ']';
         if( c != expected_bracket )
-            CV_Error_( CV_StsError, ("The closing '%c' does not match the opening '%c'", c, expected_bracket));
+            CV_Error_( cv::Error::StsError, ("The closing '%c' does not match the opening '%c'", c, expected_bracket));
         fs_impl->endWriteStruct();
         CV_Assert(!fs_impl->write_stack.empty());
         struct_flags = fs_impl->write_stack.back().flags;
@@ -2013,7 +2057,7 @@ FileStorage& operator << (FileStorage& fs, const String& str)
     else if( fs.state == NAME_EXPECTED + INSIDE_MAP )
     {
         if (!cv_isalpha(c) && c != '_')
-            CV_Error_( CV_StsError, ("Incorrect element name %s; should start with a letter or '_'", _str) );
+            CV_Error_( cv::Error::StsError, ("Incorrect element name %s; should start with a letter or '_'", _str) );
         fs.elname = str;
         fs.state = VALUE_EXPECTED + INSIDE_MAP;
     }
@@ -2042,7 +2086,7 @@ FileStorage& operator << (FileStorage& fs, const String& str)
         }
     }
     else
-        CV_Error( CV_StsError, "Invalid fs.state" );
+        CV_Error( cv::Error::StsError, "Invalid fs.state" );
     return fs;
 }
 
diff --git a/modules/core/src/persistence.hpp b/modules/core/src/persistence.hpp
index 05c7adc17ce3..1a9dbecf7c5b 100644
--- a/modules/core/src/persistence.hpp
+++ b/modules/core/src/persistence.hpp
@@ -163,6 +163,24 @@ class FileStorage_API
     CV_NORETURN
     virtual void parseError(const char* funcname, const std::string& msg,
                             const char* filename, int lineno) = 0;
+
+private:
+    enum Base64State{
+        Uncertain,
+        NotUse,
+        InUse,
+    };
+
+    friend class cv::FileStorage::Impl;
+    friend class cv::FileStorage;
+    friend class JSONEmitter;
+    friend class XMLEmitter;
+    friend class YAMLEmitter;
+
+    virtual void check_if_write_struct_is_delayed(bool change_type_to_base64 = false) = 0;
+    virtual void switch_to_Base64_state(Base64State state) = 0;
+    virtual Base64State get_state_of_writing_base64() = 0;
+    virtual int get_space() = 0;
 };
 
 class FileStorageEmitter
diff --git a/modules/core/src/persistence_base64_encoding.cpp b/modules/core/src/persistence_base64_encoding.cpp
new file mode 100644
index 000000000000..7d90fd422b2d
--- /dev/null
+++ b/modules/core/src/persistence_base64_encoding.cpp
@@ -0,0 +1,370 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include "precomp.hpp"
+#include "persistence_impl.hpp"
+#include "persistence_base64_encoding.hpp"
+
+namespace cv
+{
+
+class base64::Base64ContextEmitter
+{
+public:
+    explicit Base64ContextEmitter(cv::FileStorage::Impl& fs, bool needs_indent_)
+            : file_storage(fs)
+            , needs_indent(needs_indent_)
+            , binary_buffer(BUFFER_LEN)
+            , base64_buffer(base64_encode_buffer_size(BUFFER_LEN))
+            , src_beg(0)
+            , src_cur(0)
+            , src_end(0)
+    {
+        src_beg = binary_buffer.data();
+        src_end = src_beg + BUFFER_LEN;
+        src_cur = src_beg;
+
+        CV_Assert(fs.write_mode);
+
+        if (needs_indent)
+        {
+            file_storage.flush();
+        }
+    }
+
+    ~Base64ContextEmitter()
+    {
+        /* cleaning */
+        if (src_cur != src_beg)
+            flush();    /* encode the rest binary data to base64 buffer */
+    }
+
+    Base64ContextEmitter & write(const uchar * beg, const uchar * end)
+    {
+        if (beg >= end)
+            return *this;
+
+        while (beg < end) {
+            /* collect binary data and copy to binary buffer */
+            size_t len = std::min(end - beg, src_end - src_cur);
+            std::memcpy(src_cur, beg, len);
+            beg     += len;
+            src_cur += len;
+
+            if (src_cur >= src_end) {
+                /* binary buffer is full. */
+                /* encode it to base64 and send result to fs */
+                flush();
+            }
+        }
+
+        return *this;
+    }
+
+    /*
+     * a convertor must provide :
+     * - `operator >> (uchar * & dst)` for writing current binary data to `dst` and moving to next data.
+     * - `operator bool` for checking if current loaction is valid and not the end.
+     */
+    template<typename _to_binary_convertor_t> inline
+    Base64ContextEmitter & write(_to_binary_convertor_t & convertor)
+    {
+        static const size_t BUFFER_MAX_LEN = 1024U;
+
+        std::vector<uchar> buffer(BUFFER_MAX_LEN);
+        uchar * beg = buffer.data();
+        uchar * end = beg;
+
+        while (convertor) {
+            convertor >> end;
+            write(beg, end);
+            end = beg;
+        }
+
+        return *this;
+    }
+
+    bool flush()
+    {
+        /* control line width, so on. */
+        size_t len = base64_encode(src_beg, base64_buffer.data(), 0U, src_cur - src_beg);
+        if (len == 0U)
+            return false;
+
+        src_cur = src_beg;
+
+        if ( !needs_indent)
+        {
+            file_storage.puts((const char*)base64_buffer.data());
+        }
+        else
+        {
+            const char newline[] = "\n";
+            char space[80];
+            int ident = file_storage.write_stack.back().indent;
+            memset(space, ' ', static_cast<int>(ident));
+            space[ident] = '\0';
+
+            file_storage.puts(space);
+            file_storage.puts((const char*)base64_buffer.data());
+            file_storage.puts(newline);
+            file_storage.flush();
+        }
+
+        return true;
+    }
+
+private:
+    /* because of Base64, we must keep its length a multiple of 3 */
+    static const size_t BUFFER_LEN = 48U;
+    // static_assert(BUFFER_LEN % 3 == 0, "BUFFER_LEN is invalid");
+
+private:
+    cv::FileStorage::Impl& file_storage;
+    bool needs_indent;
+
+    std::vector<uchar> binary_buffer;
+    std::vector<uchar> base64_buffer;
+    uchar * src_beg;
+    uchar * src_cur;
+    uchar * src_end;
+};
+
+std::string base64::make_base64_header(const char *dt) {
+    std::ostringstream oss;
+    oss << dt   << ' ';
+    std::string buffer(oss.str());
+    CV_Assert(buffer.size() < ::base64::HEADER_SIZE);
+
+    buffer.reserve(::base64::HEADER_SIZE);
+    while (buffer.size() < ::base64::HEADER_SIZE)
+        buffer += ' ';
+
+    return buffer;
+}
+
+size_t base64::base64_encode(const uint8_t *src, uint8_t *dst, size_t off, size_t cnt) {
+    if (!src || !dst || !cnt)
+        return 0;
+
+    /* initialize beginning and end */
+    uint8_t       * dst_beg = dst;
+    uint8_t       * dst_cur = dst_beg;
+
+    uint8_t const * src_beg = src + off;
+    uint8_t const * src_cur = src_beg;
+    uint8_t const * src_end = src_cur + cnt / 3U * 3U;
+
+    /* integer multiples part */
+    while (src_cur < src_end) {
+        uint8_t _2 = *src_cur++;
+        uint8_t _1 = *src_cur++;
+        uint8_t _0 = *src_cur++;
+        *dst_cur++ = base64_mapping[ _2          >> 2U];
+        *dst_cur++ = base64_mapping[(_1 & 0xF0U) >> 4U | (_2 & 0x03U) << 4U];
+        *dst_cur++ = base64_mapping[(_0 & 0xC0U) >> 6U | (_1 & 0x0FU) << 2U];
+        *dst_cur++ = base64_mapping[ _0 & 0x3FU];
+    }
+
+    /* remainder part */
+    size_t rst = src_beg + cnt - src_cur;
+    if (rst == 1U) {
+        uint8_t _2 = *src_cur++;
+        *dst_cur++ = base64_mapping[ _2          >> 2U];
+        *dst_cur++ = base64_mapping[(_2 & 0x03U) << 4U];
+    } else if (rst == 2U) {
+        uint8_t _2 = *src_cur++;
+        uint8_t _1 = *src_cur++;
+        *dst_cur++ = base64_mapping[ _2          >> 2U];
+        *dst_cur++ = base64_mapping[(_2 & 0x03U) << 4U | (_1 & 0xF0U) >> 4U];
+        *dst_cur++ = base64_mapping[(_1 & 0x0FU) << 2U];
+    }
+
+    /* padding */
+    switch (rst)
+    {
+        case 1U: *dst_cur++ = base64_padding;
+            /* fallthrough */
+        case 2U: *dst_cur++ = base64_padding;
+            /* fallthrough */
+        default: *dst_cur   = 0;
+            break;
+    }
+
+    return static_cast<size_t>(dst_cur - dst_beg);
+}
+
+int base64::icvCalcStructSize(const char *dt, int initial_size) {
+    int size = cv::fs::calcElemSize( dt, initial_size );
+    size_t elem_max_size = 0;
+    for ( const char * type = dt; *type != '\0'; type++ ) {
+        switch ( *type )
+        {
+            case 'u': { elem_max_size = std::max( elem_max_size, sizeof(uchar ) ); break; }
+            case 'c': { elem_max_size = std::max( elem_max_size, sizeof(schar ) ); break; }
+            case 'w': { elem_max_size = std::max( elem_max_size, sizeof(ushort) ); break; }
+            case 's': { elem_max_size = std::max( elem_max_size, sizeof(short ) ); break; }
+            case 'i': { elem_max_size = std::max( elem_max_size, sizeof(int   ) ); break; }
+            case 'f': { elem_max_size = std::max( elem_max_size, sizeof(float ) ); break; }
+            case 'd': { elem_max_size = std::max( elem_max_size, sizeof(double) ); break; }
+            default: break;
+        }
+    }
+    size = cvAlign( size, static_cast<int>(elem_max_size) );
+    return size;
+}
+
+size_t base64::base64_encode_buffer_size(size_t cnt, bool is_end_with_zero) {
+    size_t additional = static_cast<size_t>(is_end_with_zero == true);
+    return (cnt + 2U) / 3U * 4U + additional;
+}
+
+base64::Base64Writer::Base64Writer(cv::FileStorage::Impl& fs, bool can_indent)
+        : emitter(new Base64ContextEmitter(fs, can_indent))
+        , data_type_string()
+{
+    CV_Assert(fs.write_mode);
+}
+
+void base64::Base64Writer::write(const void* _data, size_t len, const char* dt)
+{
+    check_dt(dt);
+    RawDataToBinaryConvertor convertor(_data, static_cast<int>(len), data_type_string);
+    emitter->write(convertor);
+}
+
+template<typename _to_binary_convertor_t> inline
+void base64::Base64Writer::write(_to_binary_convertor_t & convertor, const char* dt)
+{
+    check_dt(dt);
+    emitter->write(convertor);
+}
+
+base64::Base64Writer::~Base64Writer()
+{
+    delete emitter;
+}
+
+void base64::Base64Writer::check_dt(const char* dt)
+{
+    if ( dt == 0 )
+        CV_Error( cv::Error::StsBadArg, "Invalid \'dt\'." );
+    else if (data_type_string.empty()) {
+        data_type_string = dt;
+
+        /* output header */
+        std::string buffer = make_base64_header(dt);
+        const uchar * beg = reinterpret_cast<const uchar *>(buffer.data());
+        const uchar * end = beg + buffer.size();
+
+        emitter->write(beg, end);
+    } else if ( data_type_string != dt )
+        CV_Error( cv::Error::StsBadArg, "\'dt\' does not match." );
+}
+
+base64::RawDataToBinaryConvertor::RawDataToBinaryConvertor(const void* src, int len, const std::string & dt)
+        : beg(reinterpret_cast<const uchar *>(src))
+        , cur(0)
+        , end(0)
+{
+    CV_Assert(src);
+    CV_Assert(!dt.empty());
+    CV_Assert(len > 0);
+
+    /* calc step and to_binary_funcs */
+    step_packed = make_to_binary_funcs(dt);
+
+    end = beg;
+    cur = beg;
+
+    step = icvCalcStructSize(dt.c_str(), 0);
+    end = beg + static_cast<size_t>(len);
+}
+
+inline  base64::RawDataToBinaryConvertor&  base64::RawDataToBinaryConvertor::operator >>(uchar * & dst)
+{
+    CV_DbgAssert(*this);
+
+    for (size_t i = 0U, n = to_binary_funcs.size(); i < n; i++) {
+        elem_to_binary_t & pack = to_binary_funcs[i];
+        pack.func(cur + pack.offset, dst + pack.offset_packed);
+    }
+    cur += step;
+    dst += step_packed;
+
+    return *this;
+}
+
+inline  base64::RawDataToBinaryConvertor::operator bool() const
+{
+    return cur < end;
+}
+
+size_t base64::RawDataToBinaryConvertor::make_to_binary_funcs(const std::string &dt)
+{
+    size_t cnt = 0;
+    size_t offset = 0;
+    size_t offset_packed = 0;
+    char type = '\0';
+
+    std::istringstream iss(dt);
+    while (!iss.eof()) {
+        if (!(iss >> cnt)) {
+            iss.clear();
+            cnt = 1;
+        }
+        CV_Assert(cnt > 0U);
+        if (!(iss >> type))
+            break;
+
+        while (cnt-- > 0)
+        {
+            elem_to_binary_t pack;
+
+            size_t size = 0;
+            switch (type)
+            {
+                case 'u':
+                case 'c':
+                    size = sizeof(uchar);
+                    pack.func = to_binary<uchar>;
+                    break;
+                case 'w':
+                case 's':
+                    size = sizeof(ushort);
+                    pack.func = to_binary<ushort>;
+                    break;
+                case 'i':
+                    size = sizeof(uint);
+                    pack.func = to_binary<uint>;
+                    break;
+                case 'f':
+                    size = sizeof(float);
+                    pack.func = to_binary<float>;
+                    break;
+                case 'd':
+                    size = sizeof(double);
+                    pack.func = to_binary<double>;
+                    break;
+                case 'r':
+                default:
+                    CV_Error(cv::Error::StsError, "type is not supported");
+            };
+
+            offset = static_cast<size_t>(cvAlign(static_cast<int>(offset), static_cast<int>(size)));
+            pack.offset = offset;
+            offset += size;
+
+            pack.offset_packed = offset_packed;
+            offset_packed += size;
+
+            to_binary_funcs.push_back(pack);
+        }
+    }
+
+    CV_Assert(iss.eof());
+    return offset_packed;
+}
+
+}
\ No newline at end of file
diff --git a/modules/core/src/persistence_base64_encoding.hpp b/modules/core/src/persistence_base64_encoding.hpp
new file mode 100644
index 000000000000..1ee5201e141f
--- /dev/null
+++ b/modules/core/src/persistence_base64_encoding.hpp
@@ -0,0 +1,127 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_CORE_BASE64_ENCODING_HPP
+#define OPENCV_CORE_BASE64_ENCODING_HPP
+
+namespace cv
+{
+
+namespace base64
+{
+/* A decorator for CvFileStorage
+* - no copyable
+* - not safe for now
+* - move constructor may be needed if C++11
+*/
+uint8_t const base64_mapping[] =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        "abcdefghijklmnopqrstuvwxyz"
+        "0123456789+/";
+
+uint8_t const base64_padding = '=';
+
+std::string make_base64_header(const char * dt);
+
+size_t base64_encode(uint8_t const * src, uint8_t * dst, size_t off, size_t cnt);
+
+
+int icvCalcStructSize( const char* dt, int initial_size );
+
+class Base64ContextEmitter;
+class Impl;
+
+class Base64Writer
+{
+public:
+    Base64Writer(cv::FileStorage::Impl& fs, bool can_indent);
+    ~Base64Writer();
+    void write(const void* _data, size_t len, const char* dt);
+    template<typename _to_binary_convertor_t> void write(_to_binary_convertor_t & convertor, const char* dt);
+
+private:
+    void check_dt(const char* dt);
+
+private:
+    // disable copy and assignment
+    Base64Writer(const Base64Writer &);
+    Base64Writer & operator=(const Base64Writer &);
+
+private:
+
+    Base64ContextEmitter * emitter;
+    std::string data_type_string;
+};
+
+size_t base64_encode_buffer_size(size_t cnt, bool is_end_with_zero = true);
+
+template<typename _uint_t> inline size_t
+to_binary(_uint_t val, uchar * cur)
+{
+    size_t delta = CHAR_BIT;
+    size_t cnt = sizeof(_uint_t);
+    while (cnt --> static_cast<size_t>(0U)) {
+        *cur++ = static_cast<uchar>(val);
+        val >>= delta;
+    }
+    return sizeof(_uint_t);
+}
+
+template<> inline size_t to_binary(double val, uchar * cur)
+{
+    Cv64suf bit64;
+    bit64.f = val;
+    return to_binary(bit64.u, cur);
+}
+
+template<> inline size_t to_binary(float val, uchar * cur)
+{
+    Cv32suf bit32;
+    bit32.f = val;
+    return to_binary(bit32.u, cur);
+}
+
+template<typename _primitive_t> inline size_t
+to_binary(uchar const * val, uchar * cur)
+{
+    return to_binary<_primitive_t>(*reinterpret_cast<_primitive_t const *>(val), cur);
+}
+
+
+
+class RawDataToBinaryConvertor
+{
+public:
+    // NOTE: len is already multiplied by element size here
+    RawDataToBinaryConvertor(const void* src, int len, const std::string & dt);
+
+    inline RawDataToBinaryConvertor & operator >>(uchar * & dst);
+    inline operator bool() const;
+
+private:
+    typedef size_t(*to_binary_t)(const uchar *, uchar *);
+    struct elem_to_binary_t
+    {
+        size_t      offset;
+        size_t      offset_packed;
+        to_binary_t func;
+    };
+
+private:
+    size_t make_to_binary_funcs(const std::string &dt);
+
+private:
+    const uchar * beg;
+    const uchar * cur;
+    const uchar * end;
+
+    size_t step;
+    size_t step_packed;
+    std::vector<elem_to_binary_t> to_binary_funcs;
+};
+
+}
+
+}
+#endif
\ No newline at end of file
diff --git a/modules/core/src/persistence_impl.hpp b/modules/core/src/persistence_impl.hpp
new file mode 100644
index 000000000000..4ea2dc350282
--- /dev/null
+++ b/modules/core/src/persistence_impl.hpp
@@ -0,0 +1,231 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_CORE_PERSISTENCE_IMPL_HPP
+#define OPENCV_CORE_PERSISTENCE_IMPL_HPP
+
+#include "persistence.hpp"
+#include "persistence_base64_encoding.hpp"
+#include <unordered_map>
+#include <iterator>
+
+
+namespace cv
+{
+
+enum Base64State{
+    Uncertain,
+    NotUse,
+    InUse,
+};
+
+class cv::FileStorage::Impl : public FileStorage_API
+{
+public:
+    void init();
+
+    Impl(FileStorage* _fs);
+
+    virtual ~Impl();
+
+    void release(String* out=0);
+
+    void analyze_file_name( const std::string& file_name, std::vector<std::string>& params );
+
+    bool open( const char* filename_or_buf, int _flags, const char* encoding );
+
+    void puts( const char* str );
+
+    char* getsFromFile( char* buf, int count );
+
+    char* gets( size_t maxCount );
+
+    char* gets();
+
+    bool eof();
+
+    void setEof();
+
+    void closeFile();
+
+    void rewind();
+
+    char* resizeWriteBuffer( char* ptr, int len );
+
+    char* flush();
+
+    void endWriteStruct();
+
+    void startWriteStruct_helper( const char* key, int struct_flags,
+                                  const char* type_name );
+
+    void startWriteStruct( const char* key, int struct_flags,
+                           const char* type_name );
+
+    void writeComment( const char* comment, bool eol_comment );
+
+    void startNextStream();
+
+    void write( const String& key, int value );
+
+    void write( const String& key, double value );
+
+    void write( const String& key, const String& value );
+
+    void writeRawData( const std::string& dt, const void* _data, size_t len );
+
+    void workaround();
+
+    void switch_to_Base64_state( FileStorage_API::Base64State new_state);
+
+    void make_write_struct_delayed( const char* key, int struct_flags, const char* type_name );
+
+    void check_if_write_struct_is_delayed( bool change_type_to_base64 );
+
+    void writeRawDataBase64(const void* _data, size_t len, const char* dt );
+
+    String releaseAndGetString();
+
+    FileNode getFirstTopLevelNode() const;
+
+    FileNode root(int streamIdx=0) const;
+
+    FileNode operator[](const String& nodename) const;
+
+    FileNode operator[](const char* /*nodename*/) const;
+
+    int getFormat() const;
+
+    char* bufferPtr() const;
+    char* bufferStart() const;
+    char* bufferEnd() const;
+    void setBufferPtr(char* ptr);
+    int wrapMargin() const;
+
+    FStructData& getCurrentStruct();
+
+    void setNonEmpty();
+
+    void processSpecialDouble( char* buf, double* value, char** endptr );
+
+    double strtod( char* ptr, char** endptr );
+
+    void convertToCollection(int type, FileNode& node);
+
+    // a) allocates new FileNode (for that just set blockIdx to the last block and ofs to freeSpaceOfs) or
+    // b) reallocates just created new node (blockIdx and ofs must be taken from FileNode).
+    //    If there is no enough space in the current block (it should be the last block added so far),
+    //    the last block is shrunk so that it ends immediately before the reallocated node. Then,
+    //    a new block of sufficient size is allocated and the FileNode is placed in the beginning of it.
+    // The case (a) can be used to allocate the very first node by setting blockIdx == ofs == 0.
+    // In the case (b) the existing tag and the name are copied automatically.
+    uchar* reserveNodeSpace(FileNode& node, size_t sz);
+
+    unsigned getStringOfs( const std::string& key ) const;
+
+    FileNode addNode( FileNode& collection, const std::string& key,
+                      int elem_type, const void* value, int len );
+
+    void finalizeCollection( FileNode& collection );
+
+    void normalizeNodeOfs(size_t& blockIdx, size_t& ofs) const;
+
+    Base64State get_state_of_writing_base64();
+
+    int get_space();
+
+    class Base64Decoder
+    {
+    public:
+        Base64Decoder();
+        void init(Ptr<FileStorageParser>& _parser, char* _ptr, int _indent);
+
+        bool readMore(int needed);
+
+        uchar getUInt8();
+
+        ushort getUInt16();
+
+        int getInt32();
+
+        double getFloat64();
+
+        bool endOfStream() const;
+        char* getPtr() const;
+    protected:
+
+        Ptr<FileStorageParser> parser;
+        char* ptr;
+        int indent;
+        std::vector<char> encoded;
+        std::vector<uchar> decoded;
+        size_t ofs;
+        size_t totalchars;
+        bool eos;
+    };
+
+    char* parseBase64(char* ptr, int indent, FileNode& collection);
+
+    void parseError( const char* func_name, const std::string& err_msg, const char* source_file, int source_line );
+
+    const uchar* getNodePtr(size_t blockIdx, size_t ofs) const;
+
+    std::string getName( size_t nameofs ) const;
+
+    FileStorage* getFS();
+
+    FileStorage* fs_ext;
+
+    std::string filename;
+    int flags;
+    bool empty_stream;
+
+    FILE* file;
+    gzFile gzfile;
+
+    bool is_opened;
+    bool dummy_eof;
+    bool write_mode;
+    bool mem_mode;
+    int fmt;
+
+    State state; //!< current state of the FileStorage (used only for writing)
+    bool is_using_base64;
+    bool is_write_struct_delayed;
+    char* delayed_struct_key;
+    int   delayed_struct_flags;
+    char* delayed_type_name;
+    FileStorage_API::Base64State state_of_writing_base64;
+
+    int space, wrap_margin;
+    std::deque<FStructData> write_stack;
+    std::vector<char> buffer;
+    size_t bufofs;
+
+    std::deque<char> outbuf;
+
+    Ptr<FileStorageEmitter> emitter;
+    Ptr<FileStorageParser> parser;
+    Base64Decoder base64decoder;
+    base64::Base64Writer* base64_writer;
+
+    std::vector<FileNode> roots;
+    std::vector<Ptr<std::vector<uchar> > > fs_data;
+    std::vector<uchar*> fs_data_ptrs;
+    std::vector<size_t> fs_data_blksz;
+    size_t freeSpaceOfs;
+    typedef std::unordered_map<std::string, unsigned> str_hash_t;
+    str_hash_t str_hash;
+    std::vector<char> str_hash_data;
+
+    std::vector<char> strbufv;
+    char* strbuf;
+    size_t strbufsize;
+    size_t strbufpos;
+    int lineno;
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/modules/core/src/persistence_json.cpp b/modules/core/src/persistence_json.cpp
index 667895fbc5e7..12a58e80bfa0 100644
--- a/modules/core/src/persistence_json.cpp
+++ b/modules/core/src/persistence_json.cpp
@@ -23,7 +23,7 @@ class JSONEmitter : public FileStorageEmitter
 
         struct_flags = (struct_flags & (FileNode::TYPE_MASK|FileNode::FLOW)) | FileNode::EMPTY;
         if( !FileNode::isCollection(struct_flags))
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "Some collection type - FileNode::SEQ or FileNode::MAP, must be specified" );
 
         if( type_name && *type_name == '\0' )
@@ -53,29 +53,26 @@ class JSONEmitter : public FileStorageEmitter
     void endWriteStruct(const FStructData& current_struct)
     {
         int struct_flags = current_struct.flags;
-        CV_Assert( FileNode::isCollection(struct_flags) );
 
-        if( !FileNode::isFlow(struct_flags) )
-        {
-#if 0
-            if ( fs->bufferPtr() <= fs->bufferStart() + fs->space )
-            {
-                /* some bad code for base64_writer... */
-                ptr = fs->bufferPtr();
-                *ptr++ = '\n';
-                *ptr++ = '\0';
-                fs->puts( fs->bufferStart() );
-                fs->setBufferPtr(fs->bufferStart());
+        if (FileNode::isCollection(struct_flags)) {
+            if (!FileNode::isFlow(struct_flags)) {
+                if (fs->bufferPtr() <= fs->bufferStart() + fs->get_space()) {
+                    /* some bad code for base64_writer... */
+                    char *ptr = fs->bufferPtr();
+                    *ptr++ = '\n';
+                    *ptr++ = '\0';
+                    fs->puts(fs->bufferStart());
+                    fs->setBufferPtr(fs->bufferStart());
+                }
+                fs->flush();
             }
-#endif
-            fs->flush();
-        }
 
-        char* ptr = fs->bufferPtr();
-        if( ptr > fs->bufferStart() + current_struct.indent && !FileNode::isEmptyCollection(struct_flags) )
-            *ptr++ = ' ';
-        *ptr++ = FileNode::isMap(struct_flags) ? '}' : ']';
-        fs->setBufferPtr(ptr);
+            char *ptr = fs->bufferPtr();
+            if (ptr > fs->bufferStart() + current_struct.indent && !FileNode::isEmptyCollection(struct_flags))
+                *ptr++ = ' ';
+            *ptr++ = FileNode::isMap(struct_flags) ? '}' : ']';
+            fs->setBufferPtr(ptr);
+        }
     }
 
     void write(const char* key, int value)
@@ -97,11 +94,11 @@ class JSONEmitter : public FileStorageEmitter
         int i, len;
 
         if( !str )
-            CV_Error( CV_StsNullPtr, "Null string pointer" );
+            CV_Error( cv::Error::StsNullPtr, "Null string pointer" );
 
         len = (int)strlen(str);
         if( len > CV_FS_MAX_LEN )
-            CV_Error( CV_StsBadArg, "The written string is too long" );
+            CV_Error( cv::Error::StsBadArg, "The written string is too long" );
 
         if( quote || len == 0 || str[0] != str[len-1] || (str[0] != '\"' && str[0] != '\'') )
         {
@@ -136,6 +133,20 @@ class JSONEmitter : public FileStorageEmitter
 
     void writeScalar(const char* key, const char* data)
     {
+        /* check write_struct */
+
+        fs->check_if_write_struct_is_delayed(false);
+        if ( fs->get_state_of_writing_base64() == FileStorage_API::Uncertain )
+        {
+            fs->switch_to_Base64_state( FileStorage_API::NotUse );
+        }
+        else if ( fs->get_state_of_writing_base64() == FileStorage_API::InUse )
+        {
+            CV_Error( cv::Error::StsError, "At present, output Base64 data only." );
+        }
+
+        /* check parameters */
+
         size_t key_len = 0u;
         if( key && *key == '\0' )
             key = 0;
@@ -143,9 +154,9 @@ class JSONEmitter : public FileStorageEmitter
         {
             key_len = strlen(key);
             if ( key_len == 0u )
-                CV_Error( CV_StsBadArg, "The key is an empty" );
+                CV_Error( cv::Error::StsBadArg, "The key is an empty" );
             else if ( static_cast<int>(key_len) > CV_FS_MAX_LEN )
-                CV_Error( CV_StsBadArg, "The key is too long" );
+                CV_Error( cv::Error::StsBadArg, "The key is too long" );
         }
 
         size_t data_len = 0u;
@@ -157,7 +168,7 @@ class JSONEmitter : public FileStorageEmitter
         if( FileNode::isCollection(struct_flags) )
         {
             if ( (FileNode::isMap(struct_flags) ^ (key != 0)) )
-                CV_Error( CV_StsBadArg, "An attempt to add element without a key to a map, "
+                CV_Error( cv::Error::StsBadArg, "An attempt to add element without a key to a map, "
                          "or add element with key to sequence" );
         } else {
             fs->setNonEmpty();
@@ -199,7 +210,7 @@ class JSONEmitter : public FileStorageEmitter
         if( key )
         {
             if( !cv_isalpha(key[0]) && key[0] != '_' )
-                CV_Error( CV_StsBadArg, "Key must start with a letter or _" );
+                CV_Error( cv::Error::StsBadArg, "Key must start with a letter or _" );
 
             ptr = fs->resizeWriteBuffer( ptr, static_cast<int>(key_len) );
             *ptr++ = '\"';
@@ -210,7 +221,7 @@ class JSONEmitter : public FileStorageEmitter
 
                 ptr[i] = c;
                 if( !cv_isalnum(c) && c != '-' && c != '_' && c != ' ' )
-                    CV_Error( CV_StsBadArg, "Key names may only contain alphanumeric characters [a-zA-Z0-9], '-', '_' and ' '" );
+                    CV_Error( cv::Error::StsBadArg, "Key names may only contain alphanumeric characters [a-zA-Z0-9], '-', '_' and ' '" );
             }
 
             ptr += key_len;
@@ -233,7 +244,7 @@ class JSONEmitter : public FileStorageEmitter
     void writeComment(const char* comment, bool eol_comment)
     {
         if( !comment )
-            CV_Error( CV_StsNullPtr, "Null comment" );
+            CV_Error( cv::Error::StsNullPtr, "Null comment" );
 
         int len = static_cast<int>(strlen(comment));
         char* ptr = fs->bufferPtr();
diff --git a/modules/core/src/persistence_xml.cpp b/modules/core/src/persistence_xml.cpp
index 52b53744254e..62b7b1eb59c6 100644
--- a/modules/core/src/persistence_xml.cpp
+++ b/modules/core/src/persistence_xml.cpp
@@ -45,7 +45,7 @@ class XMLEmitter : public FileStorageEmitter
             if( FileNode::isCollection(struct_flags) )
             {
                 if( FileNode::isMap(struct_flags) ^ (key != 0) )
-                    CV_Error( CV_StsBadArg, "An attempt to add element without a key to a map, "
+                    CV_Error( cv::Error::StsBadArg, "An attempt to add element without a key to a map, "
                              "or add element with key to sequence" );
             }
             else
@@ -61,26 +61,26 @@ class XMLEmitter : public FileStorageEmitter
         if( !key )
             key = "_";
         else if( key[0] == '_' && key[1] == '\0' )
-            CV_Error( CV_StsBadArg, "A single _ is a reserved tag name" );
+            CV_Error( cv::Error::StsBadArg, "A single _ is a reserved tag name" );
 
         len = (int)strlen( key );
         *ptr++ = '<';
         if( tag_type == CV_XML_CLOSING_TAG )
         {
             if( !attrlist.empty() )
-                CV_Error( CV_StsBadArg, "Closing tag should not include any attributes" );
+                CV_Error( cv::Error::StsBadArg, "Closing tag should not include any attributes" );
             *ptr++ = '/';
         }
 
         if( !cv_isalpha(key[0]) && key[0] != '_' )
-            CV_Error( CV_StsBadArg, "Key should start with a letter or _" );
+            CV_Error( cv::Error::StsBadArg, "Key should start with a letter or _" );
 
         ptr = fs->resizeWriteBuffer( ptr, len );
         for( i = 0; i < len; i++ )
         {
             char c = key[i];
             if( !cv_isalnum(c) && c != '_' && c != '-' )
-                CV_Error( CV_StsBadArg, "Key name may only contain alphanumeric characters [a-zA-Z0-9], '-' and '_'" );
+                CV_Error( cv::Error::StsBadArg, "Key name may only contain alphanumeric characters [a-zA-Z0-9], '-' and '_'" );
             ptr[i] = c;
         }
         ptr += len;
@@ -158,11 +158,11 @@ class XMLEmitter : public FileStorageEmitter
         int i, len;
 
         if( !str )
-            CV_Error( CV_StsNullPtr, "Null string pointer" );
+            CV_Error( cv::Error::StsNullPtr, "Null string pointer" );
 
         len = (int)strlen(str);
         if( len > CV_FS_MAX_LEN )
-            CV_Error( CV_StsBadArg, "The written string is too long" );
+            CV_Error( cv::Error::StsBadArg, "The written string is too long" );
 
         if( quote || len == 0 || str[0] != '\"' || str[0] != str[len-1] )
         {
@@ -233,6 +233,16 @@ class XMLEmitter : public FileStorageEmitter
 
     void writeScalar(const char* key, const char* data)
     {
+        fs->check_if_write_struct_is_delayed(false);
+        if ( fs->get_state_of_writing_base64() == FileStorage_API::Uncertain )
+        {
+            fs->switch_to_Base64_state( FileStorage_API::NotUse );
+        }
+        else if ( fs->get_state_of_writing_base64() == FileStorage_API::InUse )
+        {
+            CV_Error( cv::Error::StsError, "At present, output Base64 data only." );
+        }
+
         int len = (int)strlen(data);
         if( key && *key == '\0' )
             key = 0;
@@ -255,7 +265,7 @@ class XMLEmitter : public FileStorageEmitter
             int new_offset = (int)(ptr - fs->bufferStart()) + len;
 
             if( key )
-                CV_Error( CV_StsBadArg, "elements with keys can not be written to sequence" );
+                CV_Error( cv::Error::StsBadArg, "elements with keys can not be written to sequence" );
 
             current_struct.flags = FileNode::SEQ;
 
@@ -281,10 +291,10 @@ class XMLEmitter : public FileStorageEmitter
         char* ptr;
 
         if( !comment )
-            CV_Error( CV_StsNullPtr, "Null comment" );
+            CV_Error( cv::Error::StsNullPtr, "Null comment" );
 
         if( strstr(comment, "--") != 0 )
-            CV_Error( CV_StsBadArg, "Double hyphen \'--\' is not allowed in the comments" );
+            CV_Error( cv::Error::StsBadArg, "Double hyphen \'--\' is not allowed in the comments" );
 
         len = (int)strlen(comment);
         eol = strchr(comment, '\n');
diff --git a/modules/core/src/persistence_yml.cpp b/modules/core/src/persistence_yml.cpp
index 3f3742b8d18e..95db1450c62e 100644
--- a/modules/core/src/persistence_yml.cpp
+++ b/modules/core/src/persistence_yml.cpp
@@ -33,7 +33,7 @@ class YAMLEmitter : public FileStorageEmitter
 
         struct_flags = (struct_flags & (FileNode::TYPE_MASK|FileNode::FLOW)) | FileNode::EMPTY;
         if( !FileNode::isCollection(struct_flags))
-            CV_Error( CV_StsBadArg,
+            CV_Error( cv::Error::StsBadArg,
                      "Some collection type - FileNode::SEQ or FileNode::MAP, must be specified" );
 
         if (type_name && memcmp(type_name, "binary", 6) == 0)
@@ -120,11 +120,11 @@ class YAMLEmitter : public FileStorageEmitter
         int i, len;
 
         if( !str )
-            CV_Error( CV_StsNullPtr, "Null string pointer" );
+            CV_Error( cv::Error::StsNullPtr, "Null string pointer" );
 
         len = (int)strlen(str);
         if( len > CV_FS_MAX_LEN )
-            CV_Error( CV_StsBadArg, "The written string is too long" );
+            CV_Error( cv::Error::StsBadArg, "The written string is too long" );
 
         if( quote || len == 0 || str[0] != str[len-1] || (str[0] != '\"' && str[0] != '\'') )
         {
@@ -174,6 +174,16 @@ class YAMLEmitter : public FileStorageEmitter
 
     void writeScalar(const char* key, const char* data)
     {
+        fs->check_if_write_struct_is_delayed(false);
+        if ( fs->get_state_of_writing_base64() == FileStorage_API::Uncertain )
+        {
+            fs->switch_to_Base64_state( FileStorage_API::NotUse );
+        }
+        else if ( fs->get_state_of_writing_base64() == FileStorage_API::InUse )
+        {
+            CV_Error( cv::Error::StsError, "At present, output Base64 data only." );
+        }
+
         int i, keylen = 0;
         int datalen = 0;
         char* ptr;
@@ -188,7 +198,7 @@ class YAMLEmitter : public FileStorageEmitter
         if( FileNode::isCollection(struct_flags) )
         {
             if( (FileNode::isMap(struct_flags) ^ (key != 0)) )
-                CV_Error( CV_StsBadArg, "An attempt to add element without a key to a map, "
+                CV_Error( cv::Error::StsBadArg, "An attempt to add element without a key to a map, "
                          "or add element with key to sequence" );
         }
         else
@@ -201,10 +211,10 @@ class YAMLEmitter : public FileStorageEmitter
         {
             keylen = (int)strlen(key);
             if( keylen == 0 )
-                CV_Error( CV_StsBadArg, "The key is an empty" );
+                CV_Error( cv::Error::StsBadArg, "The key is an empty" );
 
             if( keylen > CV_FS_MAX_LEN )
-                CV_Error( CV_StsBadArg, "The key is too long" );
+                CV_Error( cv::Error::StsBadArg, "The key is too long" );
         }
 
         if( data )
@@ -238,7 +248,7 @@ class YAMLEmitter : public FileStorageEmitter
         if( key )
         {
             if( !cv_isalpha(key[0]) && key[0] != '_' )
-                CV_Error( CV_StsBadArg, "Key must start with a letter or _" );
+                CV_Error( cv::Error::StsBadArg, "Key must start with a letter or _" );
 
             ptr = fs->resizeWriteBuffer( ptr, keylen );
 
@@ -248,7 +258,7 @@ class YAMLEmitter : public FileStorageEmitter
 
                 ptr[i] = c;
                 if( !cv_isalnum(c) && c != '-' && c != '_' && c != ' ' )
-                    CV_Error( CV_StsBadArg, "Key names may only contain alphanumeric characters [a-zA-Z0-9], '-', '_' and ' '" );
+                    CV_Error( cv::Error::StsBadArg, "Key names may only contain alphanumeric characters [a-zA-Z0-9], '-', '_' and ' '" );
             }
 
             ptr += keylen;
@@ -271,7 +281,7 @@ class YAMLEmitter : public FileStorageEmitter
     void writeComment(const char* comment, bool eol_comment)
     {
         if( !comment )
-            CV_Error( CV_StsNullPtr, "Null comment" );
+            CV_Error( cv::Error::StsNullPtr, "Null comment" );
 
         int len = (int)strlen(comment);
         const char* eol = strchr(comment, '\n');
diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp
index af4a62181697..ccb962e4ee64 100644
--- a/modules/core/src/system.cpp
+++ b/modules/core/src/system.cpp
@@ -116,10 +116,14 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 #include <cstdlib>        // std::abort
 #endif
 
-#if defined __ANDROID__ || defined __linux__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__
+#if defined __ANDROID__ || defined __unix__ || defined __FreeBSD__ || defined __OpenBSD__ || defined __HAIKU__ || defined __Fuchsia__
 #  include <unistd.h>
 #  include <fcntl.h>
+#if defined __QNXNTO__
+#  include <sys/elf.h>
+#else
 #  include <elf.h>
+#endif
 #if defined __ANDROID__ || defined __linux__
 #  include <linux/auxvec.h>
 #endif
@@ -130,7 +134,7 @@ void* allocSingletonNewBuffer(size_t size) { return malloc(size); }
 #endif
 
 
-#if (defined __ppc64__ || defined __PPC64__) && defined __linux__
+#if (defined __ppc64__ || defined __PPC64__) && defined __unix__
 # include "sys/auxv.h"
 # ifndef AT_HWCAP2
 #   define AT_HWCAP2 26
@@ -216,7 +220,9 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
 
 #endif
 #else
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
 #include <pthread.h>
+#endif
 #include <sys/time.h>
 #include <time.h>
 
@@ -231,7 +237,7 @@ std::wstring GetTempFileNameWinRT(std::wstring prefix)
 #include "omp.h"
 #endif
 
-#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__
+#if defined __unix__ || defined __APPLE__ || defined __EMSCRIPTEN__ || defined __FreeBSD__ || defined __GLIBC__ || defined __HAIKU__
 #include <unistd.h>
 #include <stdio.h>
 #include <sys/types.h>
@@ -598,7 +604,7 @@ struct HWFeatures
         have[CV_CPU_MSA] = true;
     #endif
 
-    #if (defined __ppc64__ || defined __PPC64__) && defined __linux__
+    #if (defined __ppc64__ || defined __PPC64__) && defined __unix__
         unsigned int hwcap = getauxval(AT_HWCAP);
         if (hwcap & PPC_FEATURE_HAS_VSX) {
             hwcap = getauxval(AT_HWCAP2);
@@ -812,12 +818,12 @@ int64 getTickCount(void)
     LARGE_INTEGER counter;
     QueryPerformanceCounter( &counter );
     return (int64)counter.QuadPart;
-#elif defined __linux || defined __linux__
+#elif defined __MACH__ && defined __APPLE__
+    return (int64)mach_absolute_time();
+#elif defined __unix__
     struct timespec tp;
     clock_gettime(CLOCK_MONOTONIC, &tp);
     return (int64)tp.tv_sec*1000000000 + tp.tv_nsec;
-#elif defined __MACH__ && defined __APPLE__
-    return (int64)mach_absolute_time();
 #else
     struct timeval tv;
     gettimeofday(&tv, NULL);
@@ -831,8 +837,6 @@ double getTickFrequency(void)
     LARGE_INTEGER freq;
     QueryPerformanceFrequency(&freq);
     return (double)freq.QuadPart;
-#elif defined __linux || defined __linux__
-    return 1e9;
 #elif defined __MACH__ && defined __APPLE__
     static double freq = 0;
     if( freq == 0 )
@@ -842,6 +846,8 @@ double getTickFrequency(void)
         freq = sTimebaseInfo.denom*1e9/sTimebaseInfo.numer;
     }
     return freq;
+#elif defined __unix__
+    return 1e9;
 #else
     return 1e6;
 #endif
@@ -1366,6 +1372,8 @@ bool __termination = false;
 
 namespace details {
 
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
+
 #ifdef _WIN32
 #ifdef _MSC_VER
 #pragma warning(disable:4505) // unreferenced local function has been removed
@@ -1778,14 +1786,122 @@ static void WINAPI opencv_fls_destructor(void* pData)
 #endif // CV_USE_FLS
 #endif // _WIN32
 
+#else  // OPENCV_DISABLE_THREAD_SUPPORT
+
+// no threading (OPENCV_DISABLE_THREAD_SUPPORT=ON)
+class TlsStorage
+{
+public:
+    TlsStorage()
+    {
+        slots.reserve(32);
+    }
+    ~TlsStorage()
+    {
+        for (size_t slotIdx = 0; slotIdx < slots.size(); slotIdx++)
+        {
+            SlotInfo& s = slots[slotIdx];
+            TLSDataContainer* container = s.container;
+            if (container && s.data)
+            {
+                container->deleteDataInstance(s.data);  // Can't use from SlotInfo destructor
+                s.data = nullptr;
+            }
+        }
+    }
+
+    // Reserve TLS storage index
+    size_t reserveSlot(TLSDataContainer* container)
+    {
+        size_t slotsSize = slots.size();
+        for (size_t slot = 0; slot < slotsSize; slot++)
+        {
+            SlotInfo& s = slots[slot];
+            if (s.container == NULL)
+            {
+                CV_Assert(!s.data);
+                s.container = container;
+                return slot;
+            }
+        }
+
+        // create new slot
+        slots.push_back(SlotInfo(container));
+        return slotsSize;
+    }
+
+    // Release TLS storage index and pass associated data to caller
+    void releaseSlot(size_t slotIdx, std::vector<void*> &dataVec, bool keepSlot = false)
+    {
+        CV_Assert(slotIdx < slots.size());
+        SlotInfo& s = slots[slotIdx];
+        void* data = s.data;
+        if (data)
+        {
+            dataVec.push_back(data);
+            s.data = nullptr;
+        }
+        if (!keepSlot)
+        {
+            s.container = NULL;  // mark slot as free (see reserveSlot() implementation)
+        }
+    }
+
+    // Get data by TLS storage index
+    void* getData(size_t slotIdx) const
+    {
+        CV_Assert(slotIdx < slots.size());
+        const SlotInfo& s = slots[slotIdx];
+        return s.data;
+    }
+
+    // Gather data from threads by TLS storage index
+    void gather(size_t slotIdx, std::vector<void*> &dataVec)
+    {
+        CV_Assert(slotIdx < slots.size());
+        SlotInfo& s = slots[slotIdx];
+        void* data = s.data;
+        if (data)
+            dataVec.push_back(data);
+        return;
+    }
+
+    // Set data to storage index
+    void setData(size_t slotIdx, void* pData)
+    {
+        CV_Assert(slotIdx < slots.size());
+        SlotInfo& s = slots[slotIdx];
+        s.data = pData;
+    }
+
+private:
+    struct SlotInfo
+    {
+        SlotInfo(TLSDataContainer* _container) : container(_container), data(nullptr) {}
+        TLSDataContainer* container;  // attached container (to dispose data)
+        void* data;
+    };
+    std::vector<struct SlotInfo> slots;
+};
+
+static TlsStorage& getTlsStorage()
+{
+    static TlsStorage g_storage;  // no threading
+    return g_storage;
+}
+
+#endif  // OPENCV_DISABLE_THREAD_SUPPORT
+
 } // namespace details
 using namespace details;
 
 void releaseTlsStorageThread()
 {
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
     if (!g_isTlsStorageInitialized)
         return;  // nothing to release, so prefer to avoid creation of new global structures
     getTlsStorage().releaseThread();
+#endif
 }
 
 TLSDataContainer::TLSDataContainer()
@@ -1835,7 +1951,15 @@ void* TLSDataContainer::getData() const
     {
         // Create new data instance and save it to TLS storage
         pData = createDataInstance();
-        getTlsStorage().setData(key_, pData);
+        try
+        {
+            getTlsStorage().setData(key_, pData);
+        }
+        catch (...)
+        {
+            deleteDataInstance(pData);
+            throw;
+        }
     }
     return pData;
 }
diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp
index c80d240ecc02..bbb34a725604 100644
--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -56,10 +56,6 @@ void setSize(UMat& m, int _dims, const int* _sz, const size_t* _steps,
 void updateContinuityFlag(UMat& m);
 void finalizeHdr(UMat& m);
 
-// it should be a prime number for the best hash function
-enum { UMAT_NLOCKS = 31 };
-static Mutex umatLocks[UMAT_NLOCKS];
-
 UMatData::UMatData(const MatAllocator* allocator)
 {
     prevAllocator = currAllocator = allocator;
@@ -131,6 +127,12 @@ UMatData::~UMatData()
     }
 }
 
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
+
+// it should be a prime number for the best hash function
+enum { UMAT_NLOCKS = 31 };
+static Mutex umatLocks[UMAT_NLOCKS];
+
 static size_t getUMatDataLockIndex(const UMatData* u)
 {
     size_t idx = ((size_t)(void*)u) % UMAT_NLOCKS;
@@ -228,6 +230,33 @@ UMatDataAutoLock::~UMatDataAutoLock()
     getUMatDataAutoLocker().release(u1, u2);
 }
 
+#else
+
+void UMatData::lock()
+{
+    // nothing in OPENCV_DISABLE_THREAD_SUPPORT mode
+}
+
+void UMatData::unlock()
+{
+    // nothing in OPENCV_DISABLE_THREAD_SUPPORT mode
+}
+
+UMatDataAutoLock::UMatDataAutoLock(UMatData* u) : u1(u), u2(NULL)
+{
+    // nothing in OPENCV_DISABLE_THREAD_SUPPORT mode
+}
+UMatDataAutoLock::UMatDataAutoLock(UMatData* u1_, UMatData* u2_) : u1(u1_), u2(u2_)
+{
+    // nothing in OPENCV_DISABLE_THREAD_SUPPORT mode
+}
+UMatDataAutoLock::~UMatDataAutoLock()
+{
+    // nothing in OPENCV_DISABLE_THREAD_SUPPORT mode
+}
+
+#endif  // OPENCV_DISABLE_THREAD_SUPPORT
+
 //////////////////////////////// UMat ////////////////////////////////
 
 UMat::UMat(UMatUsageFlags _usageFlags) CV_NOEXCEPT
@@ -951,11 +980,11 @@ UMat UMat::reshape(int new_cn, int new_rows) const
     return hdr;
 }
 
-UMat UMat::diag(const UMat& d)
+UMat UMat::diag(const UMat& d, UMatUsageFlags usageFlags)
 {
     CV_Assert( d.cols == 1 || d.rows == 1 );
     int len = d.rows + d.cols - 1;
-    UMat m(len, len, d.type(), Scalar(0));
+    UMat m(len, len, d.type(), Scalar(0), usageFlags);
     UMat md = m.diag();
     if( d.cols == 1 )
         d.copyTo(md);
@@ -1323,34 +1352,34 @@ UMat UMat::t() const
     return m;
 }
 
-UMat UMat::zeros(int rows, int cols, int type)
+UMat UMat::zeros(int rows, int cols, int type, UMatUsageFlags usageFlags)
 {
-    return UMat(rows, cols, type, Scalar::all(0));
+    return UMat(rows, cols, type, Scalar::all(0), usageFlags);
 }
 
-UMat UMat::zeros(Size size, int type)
+UMat UMat::zeros(Size size, int type, UMatUsageFlags usageFlags)
 {
-    return UMat(size, type, Scalar::all(0));
+    return UMat(size, type, Scalar::all(0), usageFlags);
 }
 
-UMat UMat::zeros(int ndims, const int* sz, int type)
+UMat UMat::zeros(int ndims, const int* sz, int type, UMatUsageFlags usageFlags)
 {
-    return UMat(ndims, sz, type, Scalar::all(0));
+    return UMat(ndims, sz, type, Scalar::all(0), usageFlags);
 }
 
-UMat UMat::ones(int rows, int cols, int type)
+UMat UMat::ones(int rows, int cols, int type, UMatUsageFlags usageFlags)
 {
-    return UMat::ones(Size(cols, rows), type);
+    return UMat(rows, cols, type, Scalar(1), usageFlags);
 }
 
-UMat UMat::ones(Size size, int type)
+UMat UMat::ones(Size size, int type, UMatUsageFlags usageFlags)
 {
-    return UMat(size, type, Scalar(1));
+    return UMat(size, type, Scalar(1), usageFlags);
 }
 
-UMat UMat::ones(int ndims, const int* sz, int type)
+UMat UMat::ones(int ndims, const int* sz, int type, UMatUsageFlags usageFlags)
 {
-    return UMat(ndims, sz, type, Scalar(1));
+    return UMat(ndims, sz, type, Scalar(1), usageFlags);
 }
 
 }
diff --git a/modules/core/src/utils/logtagmanager.hpp b/modules/core/src/utils/logtagmanager.hpp
index 29a1776ada21..ab4bb9b7d3d4 100644
--- a/modules/core/src/utils/logtagmanager.hpp
+++ b/modules/core/src/utils/logtagmanager.hpp
@@ -37,8 +37,8 @@ class LogTagManager
     // also, extensible functions (accepting user-provided callback) are not allowed
     // to call LogTagManger (to prevent iterator invalidation), which needs enforced
     // with a non-recursive mutex.
-    using MutexType = std::mutex;
-    using LockType = std::lock_guard<MutexType>;
+    using MutexType = cv::Mutex;
+    using LockType = cv::AutoLock;
 
     enum class MatchingScope
     {
diff --git a/modules/core/test/ocl/test_matrix_expr.cpp b/modules/core/test/ocl/test_matrix_expr.cpp
index 7a5ff72cb24e..f11c0a6ebb6d 100644
--- a/modules/core/test/ocl/test_matrix_expr.cpp
+++ b/modules/core/test/ocl/test_matrix_expr.cpp
@@ -76,6 +76,24 @@ OCL_TEST_P(UMatExpr, Ones)
     }
 }
 
+//////////////////////////////// with usageFlags /////////////////////////////////////////////////
+
+OCL_TEST_P(UMatExpr, WithUsageFlags)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        UMat u0 = UMat::zeros(size, type, cv::USAGE_ALLOCATE_HOST_MEMORY);
+        UMat u1 = UMat::ones(size, type, cv::USAGE_ALLOCATE_HOST_MEMORY);
+        UMat u8 = UMat::eye(size, type, cv::USAGE_ALLOCATE_HOST_MEMORY);
+
+        EXPECT_EQ(cv::USAGE_ALLOCATE_HOST_MEMORY, u0.usageFlags);
+        EXPECT_EQ(cv::USAGE_ALLOCATE_HOST_MEMORY, u1.usageFlags);
+        EXPECT_EQ(cv::USAGE_ALLOCATE_HOST_MEMORY, u8.usageFlags);
+    }
+}
+
 //////////////////////////////// Instantiation /////////////////////////////////////////////////
 
 OCL_INSTANTIATE_TEST_CASE_P(MatrixOperation, UMatExpr, Combine(OCL_ALL_DEPTHS_16F, OCL_ALL_CHANNELS));
diff --git a/modules/core/test/test_arithm.cpp b/modules/core/test/test_arithm.cpp
index effb0e68e05b..9e8e242d604a 100644
--- a/modules/core/test/test_arithm.cpp
+++ b/modules/core/test/test_arithm.cpp
@@ -2166,6 +2166,15 @@ TEST(Core_Norm, IPP_regression_NORM_L1_16UC3_small)
     EXPECT_EQ((double)20*cn, cv::norm(a, b, NORM_L1, mask));
 }
 
+TEST(Core_Norm, NORM_L2_8UC4)
+{
+    // Tests there is no integer overflow in norm computation for multiple channels.
+    const int kSide = 100;
+    cv::Mat4b a(kSide, kSide, cv::Scalar(255, 255, 255, 255));
+    cv::Mat4b b = cv::Mat4b::zeros(kSide, kSide);
+    const double kNorm = 2.*kSide*255.;
+    EXPECT_EQ(kNorm, cv::norm(a, b, NORM_L2));
+}
 
 TEST(Core_ConvertTo, regression_12121)
 {
diff --git a/modules/core/test/test_async.cpp b/modules/core/test/test_async.cpp
index f898a22878d2..58bcfddcd769 100644
--- a/modules/core/test/test_async.cpp
+++ b/modules/core/test/test_async.cpp
@@ -7,7 +7,7 @@
 
 #include <opencv2/core/bindings_utils.hpp>
 
-#ifdef CV_CXX11
+#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
 #include <thread>
 #include <chrono>
 #endif
@@ -85,7 +85,8 @@ TEST(Core_Async, LikePythonTest)
 }
 
 
-#ifdef CV_CXX11
+#if defined(CV_CXX11) && !defined(OPENCV_DISABLE_THREAD_SUPPORT)
+
 TEST(Core_Async, AsyncThread_Simple)
 {
     Mat m(3, 3, CV_32FC1, Scalar::all(5.0f));
diff --git a/modules/core/test/test_io.cpp b/modules/core/test/test_io.cpp
index d30c48536888..3712be9f2e39 100644
--- a/modules/core/test/test_io.cpp
+++ b/modules/core/test/test_io.cpp
@@ -586,6 +586,7 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
     const ::testing::TestInfo* const test_info = ::testing::UnitTest::GetInstance()->current_test_info();
     CV_Assert(test_info);
     std::string name = (std::string(test_info->test_case_name()) + "--" + test_info->name() + suffix_name);
+    std::string name_34 = string(cvtest::TS::ptr()->get_data_path()) + "io/3_4/" + name;
     if (!testReadWrite)
         name = string(cvtest::TS::ptr()->get_data_path()) + "io/" + name;
 
@@ -661,7 +662,23 @@ static void test_filestorage_basic(int write_flags, const char* suffix_name, boo
                 std::ifstream f(name.c_str(), std::ios::in|std::ios::binary);
                 f.seekg(0, std::fstream::end);
                 sz = (size_t)f.tellg();
+
+                f.seekg(0, std::ios::beg);
+                std::vector<char> test_data(sz);
+                f.read(&test_data[0], sz);
                 f.close();
+
+                std::ifstream reference(name_34.c_str(), std::ios::in|std::ios::binary);
+                ASSERT_TRUE(reference.is_open());
+                reference.seekg(0, std::fstream::end);
+                size_t ref_sz = (size_t)reference.tellg();
+
+                reference.seekg(0, std::ios::beg);
+                std::vector<char> reference_data(ref_sz);
+                reference.read(&reference_data[0], ref_sz);
+                reference.close();
+
+                EXPECT_EQ(reference_data, test_data);
             }
             std::cout << "Storage size: " << sz << std::endl;
             EXPECT_LE(sz, (size_t)6000);
@@ -757,27 +774,27 @@ TEST(Core_InputOutput, filestorage_base64_basic_read_JSON)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".json", false);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_rw_XML)
+TEST(Core_InputOutput, filestorage_base64_basic_rw_XML)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".xml", true);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_rw_YAML)
+TEST(Core_InputOutput, filestorage_base64_basic_rw_YAML)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".yml", true);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_rw_JSON)
+TEST(Core_InputOutput, filestorage_base64_basic_rw_JSON)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".json", true);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_memory_XML)
+TEST(Core_InputOutput, filestorage_base64_basic_memory_XML)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".xml", true, true);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_memory_YAML)
+TEST(Core_InputOutput, filestorage_base64_basic_memory_YAML)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".yml", true, true);
 }
-TEST(Core_InputOutput, DISABLED_filestorage_base64_basic_memory_JSON)
+TEST(Core_InputOutput, filestorage_base64_basic_memory_JSON)
 {
     test_filestorage_basic(cv::FileStorage::WRITE_BASE64, ".json", true, true);
 }
@@ -1837,4 +1854,69 @@ TEST(Core_InputOutput, FileStorage_copy_constructor_17412_heap)
     EXPECT_EQ(0, remove(fname.c_str()));
 }
 
+
+static void test_20279(FileStorage& fs)
+{
+    Mat m32fc1(5, 10, CV_32FC1, Scalar::all(0));
+    for (size_t i = 0; i < m32fc1.total(); i++)
+    {
+        float v = (float)i;
+        m32fc1.at<float>((int)i) = v * 0.5f;
+    }
+    Mat m16fc1;
+    // produces CV_16S output: convertFp16(m32fc1, m16fc1);
+    m32fc1.convertTo(m16fc1, CV_16FC1);
+    EXPECT_EQ(CV_16FC1, m16fc1.type()) << typeToString(m16fc1.type());
+    //std::cout << m16fc1 << std::endl;
+
+    Mat m32fc3(4, 3, CV_32FC3, Scalar::all(0));
+    for (size_t i = 0; i < m32fc3.total(); i++)
+    {
+        float v = (float)i;
+        m32fc3.at<Vec3f>((int)i) = Vec3f(v, v * 0.2f, -v);
+    }
+    Mat m16fc3;
+    m32fc3.convertTo(m16fc3, CV_16FC3);
+    EXPECT_EQ(CV_16FC3, m16fc3.type()) << typeToString(m16fc3.type());
+    //std::cout << m16fc3 << std::endl;
+
+    fs << "m16fc1" << m16fc1;
+    fs << "m16fc3" << m16fc3;
+
+    string content = fs.releaseAndGetString();
+    if (cvtest::debugLevel > 0) std::cout << content << std::endl;
+
+    FileStorage fs_read(content, FileStorage::READ + FileStorage::MEMORY);
+    Mat m16fc1_result;
+    Mat m16fc3_result;
+    fs_read["m16fc1"] >> m16fc1_result;
+    ASSERT_FALSE(m16fc1_result.empty());
+    EXPECT_EQ(CV_16FC1, m16fc1_result.type()) << typeToString(m16fc1_result.type());
+    EXPECT_LE(cvtest::norm(m16fc1_result, m16fc1, NORM_INF), 1e-2);
+
+    fs_read["m16fc3"] >> m16fc3_result;
+    ASSERT_FALSE(m16fc3_result.empty());
+    EXPECT_EQ(CV_16FC3, m16fc3_result.type()) << typeToString(m16fc3_result.type());
+    EXPECT_LE(cvtest::norm(m16fc3_result, m16fc3, NORM_INF), 1e-2);
+}
+
+TEST(Core_InputOutput, FileStorage_16F_xml)
+{
+    FileStorage fs("test.xml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    test_20279(fs);
+}
+
+TEST(Core_InputOutput, FileStorage_16F_yml)
+{
+    FileStorage fs("test.yml", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    test_20279(fs);
+}
+
+TEST(Core_InputOutput, FileStorage_16F_json)
+{
+    FileStorage fs("test.json", cv::FileStorage::WRITE | cv::FileStorage::MEMORY);
+    test_20279(fs);
+}
+
+
 }} // namespace
diff --git a/modules/core/test/test_utils.cpp b/modules/core/test/test_utils.cpp
index ed5f34603de5..c31ca75667e9 100644
--- a/modules/core/test/test_utils.cpp
+++ b/modules/core/test/test_utils.cpp
@@ -8,9 +8,12 @@
 #include "opencv2/core/utils/logger.hpp"
 #include "opencv2/core/utils/buffer_area.private.hpp"
 
-#include "test_utils_tls.impl.hpp"
 #include "opencv2/core/utils/filesystem.private.hpp"
 
+#ifndef OPENCV_DISABLE_THREAD_SUPPORT
+#include "test_utils_tls.impl.hpp"
+#endif
+
 namespace opencv_test { namespace {
 
 static const char * const keys =
diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 3ae87ef72edd..54406c799010 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -8,7 +8,8 @@ endif()
 
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
-ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX)
+ocv_add_dispatched_file_force_all("layers/layers_common" AVX AVX2 AVX512_SKX RVV)
+ocv_add_dispatched_file_force_all("int8layers/layers_common" AVX2 AVX512_SKX)
 
 ocv_add_module(dnn opencv_core opencv_imgproc WRAP python java objc js)
 
diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 24d35646df17..fbe16850d4d5 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -165,6 +165,40 @@ CV__DNN_INLINE_NS_BEGIN
         int outputNameToIndex(const String& outputName) CV_OVERRIDE;
     };
 
+    /** @brief GRU recurrent one-layer
+     *
+     * Accepts input sequence and computes the final hidden state for each element in the batch.
+     *
+     * - input[0] containing the features of the input sequence.
+     * input[0] should have shape [`T`, `N`, `data_dims`] where `T` is sequence length, `N` is batch size, `data_dims` is input size
+     * - output would have shape [`T`, `N`, `D` * `hidden_size`] where `D = 2` if layer is bidirectional otherwise `D = 1`
+     *
+     * Depends on the following attributes:
+     * - hidden_size - Number of neurons in the hidden layer
+     * - direction - RNN could be bidirectional or forward
+     *
+     * The final hidden state @f$ h_t @f$ computes by the following formulas:
+     *
+     @f{eqnarray*}{
+     r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+     z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+     n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
+     h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)} \\
+     @f}
+     * Where @f$x_t@f$ is current input, @f$h_{(t-1)}@f$ is previous or initial hidden state.
+     *
+     * @f$W_{x?}@f$, @f$W_{h?}@f$ and @f$b_{?}@f$ are learned weights represented as matrices:
+     * @f$W_{x?} \in R^{N_h \times N_x}@f$, @f$W_{h?} \in R^{N_h \times N_h}@f$, @f$b_? \in R^{N_h}@f$.
+     *
+     * @f$\odot@f$ is per-element multiply operation.
+    */
+    class CV_EXPORTS GRULayer : public Layer
+    {
+    public:
+        /** Creates instance of GRU layer */
+        static Ptr<GRULayer> create(const LayerParams& params);
+    };
+
     /** @brief Classical recurrent layer
 
     Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and compute two outputs @f$o_t@f$ and @f$h_t@f$.
@@ -224,6 +258,14 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer
+    {
+    public:
+        int input_zp, output_zp;
+        float output_sc;
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
     {
     public:
@@ -266,6 +308,13 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<PoolingLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS PoolingLayerInt8 : public PoolingLayer
+    {
+    public:
+        int input_zp, output_zp;
+        static Ptr<PoolingLayerInt8> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS SoftmaxLayer : public Layer
     {
     public:
@@ -274,6 +323,14 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<SoftmaxLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS SoftmaxLayerInt8 : public SoftmaxLayer
+    {
+    public:
+        float output_sc;
+        int output_zp;
+        static Ptr<SoftmaxLayerInt8> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS InnerProductLayer : public Layer
     {
     public:
@@ -281,6 +338,13 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<InnerProductLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS InnerProductLayerInt8 : public InnerProductLayer
+    {
+    public:
+        int output_zp;
+        static Ptr<InnerProductLayerInt8> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS MVNLayer : public Layer
     {
     public:
@@ -307,6 +371,22 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<FlattenLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS QuantizeLayer : public Layer
+    {
+    public:
+        float scale;
+        int zeropoint;
+        static Ptr<QuantizeLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS DequantizeLayer : public Layer
+    {
+    public:
+        float scale;
+        int zeropoint;
+        static Ptr<DequantizeLayer> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS ConcatLayer : public Layer
     {
     public:
@@ -318,6 +398,7 @@ CV__DNN_INLINE_NS_BEGIN
          * Details: https://github.com/torch/nn/blob/master/doc/containers.md#depthconcat
          */
         bool padding;
+        int paddingValue;
 
         static Ptr<ConcatLayer> create(const LayerParams &params);
     };
@@ -425,7 +506,11 @@ CV__DNN_INLINE_NS_BEGIN
     {
     public:
         virtual void forwardSlice(const float* src, float* dst, int len,
-                                  size_t outPlaneSize, int cn0, int cn1) const = 0;
+                                  size_t outPlaneSize, int cn0, int cn1) const {};
+        virtual void forwardSlice(const int* src, const int* lut, int* dst, int len,
+                                  size_t outPlaneSize, int cn0, int cn1) const {};
+        virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len,
+                                  size_t outPlaneSize, int cn0, int cn1) const {};
     };
 
     class CV_EXPORTS ReLULayer : public ActivationLayer
@@ -508,6 +593,12 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<ExpLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS ActivationLayerInt8 : public ActivationLayer
+    {
+    public:
+        static Ptr<ActivationLayerInt8> create(const LayerParams &params);
+    };
+
     /* Layers used in semantic segmentation */
 
     class CV_EXPORTS CropLayer : public Layer
@@ -529,6 +620,12 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<EltwiseLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS EltwiseLayerInt8 : public Layer
+    {
+    public:
+        static Ptr<EltwiseLayerInt8> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS BatchNormLayer : public ActivationLayer
     {
     public:
@@ -538,6 +635,14 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<BatchNormLayer> create(const LayerParams &params);
     };
 
+    class CV_EXPORTS BatchNormLayerInt8 : public BatchNormLayer
+    {
+    public:
+        float input_sc, output_sc;
+        int input_zp, output_zp;
+        static Ptr<BatchNormLayerInt8> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS MaxUnpoolLayer : public Layer
     {
     public:
@@ -557,12 +662,26 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<ScaleLayer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS ScaleLayerInt8 : public ScaleLayer
+    {
+    public:
+        float output_sc;
+        int output_zp;
+        static Ptr<ScaleLayerInt8> create(const LayerParams &params);
+    };
+
     class CV_EXPORTS ShiftLayer : public Layer
     {
     public:
         static Ptr<Layer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS ShiftLayerInt8 : public Layer
+    {
+    public:
+        static Ptr<Layer> create(const LayerParams& params);
+    };
+
     class CV_EXPORTS DataAugmentationLayer : public Layer
     {
     public:
@@ -689,6 +808,15 @@ CV__DNN_INLINE_NS_BEGIN
         static Ptr<Layer> create(const LayerParams& params);
     };
 
+    class CV_EXPORTS CumSumLayer : public Layer
+    {
+    public:
+        int exclusive;
+        int reverse;
+
+        static Ptr<CumSumLayer> create(const LayerParams& params);
+    };
+
 //! @}
 //! @}
 CV__DNN_INLINE_NS_END
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index c7be7b8dc440..7624d43894a6 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -236,6 +236,15 @@ CV__DNN_INLINE_NS_BEGIN
          */
         virtual void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals);
 
+        /** @brief Tries to quantize the given layer and compute the quantization parameters required for fixed point implementation.
+         *  @param[in] scales input and output scales.
+         *  @param[in] zeropoints input and output zeropoints.
+         *  @param[out] params Quantized parameters required for fixed point implementation of that layer.
+         *  @returns True if layer can be quantized.
+         */
+        virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                                 const std::vector<std::vector<int> > &zeropoints, LayerParams& params);
+
         /** @brief Given the @p input blobs, computes the output @p blobs.
          *  @param[in]  inputs  the input blobs.
          *  @param[out] outputs allocated output blobs, which will store results of the computation.
@@ -371,6 +380,16 @@ CV__DNN_INLINE_NS_BEGIN
          */
         virtual void getScaleShift(Mat& scale, Mat& shift) const;
 
+        /**
+         * @brief Returns scale and zeropoint of layers
+         * @param[out] scale Output scale
+         * @param[out] zeropoint Output zeropoint
+         *
+         * By default, @p scale is 1 and @p zeropoint is 0.
+         */
+        virtual void getScaleZeropoint(float& scale, int& zeropoint) const;
+
+
         /**
          * @brief "Deattaches" all the layers, attached to particular layer.
          */
@@ -456,13 +475,21 @@ CV__DNN_INLINE_NS_BEGIN
         /** @brief Adds new layer to the net.
          *  @param name   unique name of the adding layer.
          *  @param type   typename of the adding layer (type must be registered in LayerRegister).
+         *  @param dtype  datatype of output blobs.
          *  @param params parameters which will be used to initialize the creating layer.
          *  @returns unique identifier of created layer, or -1 if a failure will happen.
          */
+        int addLayer(const String &name, const String &type, const int &dtype, LayerParams &params);
+
+        /** @overload Datatype of output blobs set to default CV_32F */
         int addLayer(const String &name, const String &type, LayerParams &params);
+
         /** @brief Adds new layer and connects its first input to the first output of previously added layer.
          *  @see addLayer()
          */
+        int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams &params);
+
+        /** @overload */
         int addLayerToPrev(const String &name, const String &type, LayerParams &params);
 
         /** @brief Converts string name of the layer to the integer identifier.
@@ -554,6 +581,25 @@ CV__DNN_INLINE_NS_BEGIN
         CV_WRAP_AS(forwardAndRetrieve) void forward(CV_OUT std::vector<std::vector<Mat> >& outputBlobs,
                                                     const std::vector<String>& outBlobNames);
 
+        /** @brief Returns a quantized Net from a floating-point Net.
+         *  @param calibData Calibration data to compute the quantization parameters.
+         *  @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S.
+         *  @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S.
+         */
+        CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype);
+
+        /** @brief Returns input scale and zeropoint for a quantized Net.
+         *  @param scales output parameter for returning input scales.
+         *  @param zeropoints output parameter for returning input zeropoints.
+         */
+        CV_WRAP void getInputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
+
+        /** @brief Returns output scale and zeropoint for a quantized Net.
+         *  @param scales output parameter for returning output scales.
+         *  @param zeropoints output parameter for returning output zeropoints.
+         */
+        CV_WRAP void getOutputDetails(CV_OUT std::vector<float>& scales, CV_OUT std::vector<int>& zeropoints) const;
+
         /**
          * @brief Compile Halide layers.
          * @param[in] scheduler Path to YAML file with scheduling directives.
@@ -1376,7 +1422,9 @@ class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model
 
     /**
      * @brief Set the decoding method of translating the network output into string
-     * @param[in] decodeType The decoding method of translating the network output into string: {'CTC-greedy': greedy decoding for the output of CTC-based methods}
+     * @param[in] decodeType The decoding method of translating the network output into string, currently supported type:
+     *    - `"CTC-greedy"` greedy decoding for the output of CTC-based methods
+     *    - `"CTC-prefix-beam-search"` Prefix beam search decoding for the output of CTC-based methods
      */
     CV_WRAP
     TextRecognitionModel& setDecodeType(const std::string& decodeType);
@@ -1388,6 +1436,15 @@ class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model
     CV_WRAP
     const std::string& getDecodeType() const;
 
+    /**
+     * @brief Set the decoding method options for `"CTC-prefix-beam-search"` decode usage
+     * @param[in] beamSize Beam size for search
+     * @param[in] vocPruneSize Parameter to optimize big vocabulary search,
+     * only take top @p vocPruneSize tokens in each search step, @p vocPruneSize <= 0 stands for disable this prune.
+     */
+    CV_WRAP
+    TextRecognitionModel& setDecodeOptsCTCPrefixBeamSearch(int beamSize, int vocPruneSize = 0);
+
     /**
      * @brief Set the vocabulary for recognition.
      * @param[in] vocabulary the associated vocabulary of the network.
diff --git a/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp b/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp
index 46a58f09bc8b..e944644f8f21 100644
--- a/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp
+++ b/modules/dnn/include/opencv2/dnn/layer_reg.private.hpp
@@ -12,10 +12,16 @@ CV__DNN_INLINE_NS_BEGIN
 //! @addtogroup dnn
 //! @{
 
-//! Register layer types of DNN model.
 typedef std::map<std::string, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
+
+//! Register layer types of DNN model.
+//!
+//! @note In order to thread-safely access the factory, see getLayerFactoryMutex() function.
 LayerFactory_Impl& getLayerFactoryImpl();
 
+//! Get the mutex guarding @ref LayerFactory_Impl, see getLayerFactoryImpl() function.
+Mutex& getLayerFactoryMutex();
+
 //! @}
 CV__DNN_INLINE_NS_END
 }
diff --git a/modules/dnn/include/opencv2/dnn/utils/debug_utils.hpp b/modules/dnn/include/opencv2/dnn/utils/debug_utils.hpp
new file mode 100644
index 000000000000..71dd3ab8d670
--- /dev/null
+++ b/modules/dnn/include/opencv2/dnn/utils/debug_utils.hpp
@@ -0,0 +1,24 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
+#define OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
+
+#include "../dnn.hpp"
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+/**
+ * @brief Skip model import after diagnostic run in readNet() functions.
+ * @param[in] skip Indicates whether to skip the import.
+ *
+ * This is an internal OpenCV function not intended for users.
+ */
+CV_EXPORTS void skipModelImport(bool skip);
+
+CV__DNN_INLINE_NS_END
+}} // namespace
+
+#endif // OPENCV_DNN_UTILS_DEBUG_UTILS_HPP
diff --git a/modules/dnn/misc/java/gen_dict.json b/modules/dnn/misc/java/gen_dict.json
index 5a397eac51c0..65ecfdc25ea7 100644
--- a/modules/dnn/misc/java/gen_dict.json
+++ b/modules/dnn/misc/java/gen_dict.json
@@ -54,7 +54,7 @@
                 ]
 
             ],
-            "jni_name": "(*(cv::dnn::DictValue*)%(n)s_nativeObj)",
+            "jni_name": "(*(*(Ptr<cv::dnn::DictValue>*)%(n)s_nativeObj))",
             "jni_type": "jlong",
             "suffix": "J",
             "j_import": "org.opencv.dnn.DictValue"
diff --git a/modules/dnn/misc/python/test/test_dnn.py b/modules/dnn/misc/python/test/test_dnn.py
index d0687ca4bc41..31ee70b21294 100644
--- a/modules/dnn/misc/python/test/test_dnn.py
+++ b/modules/dnn/misc/python/test/test_dnn.py
@@ -62,6 +62,12 @@ def printParams(backend, target):
     }
     print('%s/%s' % (backendNames[backend], targetNames[target]))
 
+def getDefaultThreshold(target):
+    if target == cv.dnn.DNN_TARGET_OPENCL_FP16 or target == cv.dnn.DNN_TARGET_MYRIAD:
+        return 4e-3
+    else:
+        return 1e-5
+
 testdata_required = bool(os.environ.get('OPENCV_DNN_TEST_REQUIRE_TESTDATA', False))
 
 g_dnnBackendsAndTargets = None
@@ -373,5 +379,35 @@ def forward(self, inputs):
 
         cv.dnn_unregisterLayer('CropCaffe')
 
+    # check that dnn module can work with 3D tensor as input for network
+    def test_input_3d(self):
+        model = self.find_dnn_file('dnn/onnx/models/hidden_lstm.onnx')
+        input_file = self.find_dnn_file('dnn/onnx/data/input_hidden_lstm.npy')
+        output_file = self.find_dnn_file('dnn/onnx/data/output_hidden_lstm.npy')
+        if model is None:
+            raise unittest.SkipTest("Missing DNN test files (dnn/onnx/models/hidden_lstm.onnx). "
+                                    "Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+        if input_file is None or output_file is None:
+            raise unittest.SkipTest("Missing DNN test files (dnn/onnx/data/{input/output}_hidden_lstm.npy). "
+                                    "Verify OPENCV_DNN_TEST_DATA_PATH configuration parameter.")
+
+        net = cv.dnn.readNet(model)
+        input = np.load(input_file)
+        # we have to expand the shape of input tensor because Python bindings cut 3D tensors to 2D
+        # it should be fixed in future. see : https://github.com/opencv/opencv/issues/19091
+        # please remove `expand_dims` after that
+        input = np.expand_dims(input, axis=3)
+        gold_output = np.load(output_file)
+        net.setInput(input)
+
+        for backend, target in self.dnnBackendsAndTargets:
+            printParams(backend, target)
+
+            net.setPreferableBackend(backend)
+            net.setPreferableTarget(target)
+            real_output = net.forward()
+
+            normAssert(self, real_output, gold_output, "", getDefaultThreshold(target))
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/dnn/src/caffe/caffe_io.cpp b/modules/dnn/src/caffe/caffe_io.cpp
index 2fc4d84f4604..ebecf95eea3a 100644
--- a/modules/dnn/src/caffe/caffe_io.cpp
+++ b/modules/dnn/src/caffe/caffe_io.cpp
@@ -92,6 +92,7 @@
 #ifdef HAVE_PROTOBUF
 #include <google/protobuf/io/coded_stream.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/stubs/common.h>
 #include <google/protobuf/text_format.h>
 
 #include <opencv2/core.hpp>
@@ -1111,7 +1112,11 @@ static const int kProtoReadBytesLimit = INT_MAX;  // Max size of 2 GB minus 1 by
 
 bool ReadProtoFromBinary(ZeroCopyInputStream* input, Message *proto) {
     CodedInputStream coded_input(input);
+#if GOOGLE_PROTOBUF_VERSION >= 3006000
+    coded_input.SetTotalBytesLimit(kProtoReadBytesLimit);
+#else
     coded_input.SetTotalBytesLimit(kProtoReadBytesLimit, 536870912);
+#endif
 
     return proto->ParseFromCodedStream(&coded_input);
 }
diff --git a/modules/dnn/src/debug_utils.cpp b/modules/dnn/src/debug_utils.cpp
new file mode 100644
index 000000000000..d951205bd876
--- /dev/null
+++ b/modules/dnn/src/debug_utils.cpp
@@ -0,0 +1,91 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+#include <sstream>
+
+#include <opencv2/dnn/layer_reg.private.hpp>
+#include <opencv2/dnn/utils/debug_utils.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+bool DNN_DIAGNOSTICS_RUN = false;
+bool DNN_SKIP_REAL_IMPORT = false;
+
+void enableModelDiagnostics(bool isDiagnosticsMode)
+{
+    DNN_DIAGNOSTICS_RUN = isDiagnosticsMode;
+
+    if (DNN_DIAGNOSTICS_RUN)
+    {
+        detail::NotImplemented::Register();
+    }
+    else
+    {
+        detail::NotImplemented::unRegister();
+    }
+}
+
+void skipModelImport(bool skip)
+{
+    DNN_SKIP_REAL_IMPORT = skip;
+}
+
+void detail::LayerHandler::addMissing(const std::string& name, const std::string& type)
+{
+    cv::AutoLock lock(getLayerFactoryMutex());
+    auto& registeredLayers = getLayerFactoryImpl();
+
+    // If we didn't add it, but can create it, it's custom and not missing.
+    if (layers.find(type) == layers.end() && registeredLayers.find(type) != registeredLayers.end())
+    {
+        return;
+    }
+
+    layers[type].insert(name);
+}
+
+bool detail::LayerHandler::contains(const std::string& type) const
+{
+    return layers.find(type) != layers.end();
+}
+
+void detail::LayerHandler::printMissing()
+{
+    if (layers.empty())
+    {
+        return;
+    }
+
+    std::stringstream ss;
+    ss << "DNN: Not supported types:\n";
+    for (const auto& type_names : layers)
+    {
+        const auto& type = type_names.first;
+        ss << "Type='" << type << "', affected nodes:\n[";
+        for (const auto& name : type_names.second)
+        {
+            ss << "'" << name << "', ";
+        }
+        ss.seekp(-2, std::ios_base::end);
+        ss << "]\n";
+    }
+    CV_LOG_ERROR(NULL, ss.str());
+}
+
+LayerParams detail::LayerHandler::getNotImplementedParams(const std::string& name, const std::string& op)
+{
+    LayerParams lp;
+    lp.name = name;
+    lp.type = "NotImplemented";
+    lp.set("type", op);
+
+    return lp;
+}
+
+CV__DNN_INLINE_NS_END
+}} // namespace
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 75f32b65e6c6..7c4950eb9255 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -95,13 +95,6 @@ static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN
 static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
 static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);
 
-bool DNN_DIAGNOSTICS_RUN = false;
-
-void enableModelDiagnostics(bool isDiagnosticsMode)
-{
-    DNN_DIAGNOSTICS_RUN = isDiagnosticsMode;
-}
-
 using std::vector;
 using std::map;
 using std::make_pair;
@@ -573,9 +566,9 @@ struct LayerPin
 
 struct LayerData
 {
-    LayerData() : id(-1), skip(false), flag(0) {}
-    LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
-        : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
+    LayerData() : id(-1), dtype(CV_32F), skip(false), flag(0) {}
+    LayerData(int _id, const String &_name, const String &_type, const int &_dtype, LayerParams &_params)
+        : id(_id), name(_name), type(_type), dtype(_dtype), params(_params), skip(false), flag(0)
     {
         CV_TRACE_FUNCTION();
 
@@ -587,6 +580,7 @@ struct LayerData
     int id;
     String name;
     String type;
+    int dtype; // Datatype of output blobs.
     LayerParams params;
 
     std::vector<LayerPin> inputBlobsId;
@@ -943,7 +937,7 @@ struct BlobManager
         }
     }
 
-    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
+    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, const int& dtype)
     {
         if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
         {
@@ -965,7 +959,8 @@ struct BlobManager
                 {
                     Mat& unusedBlob = hostIt->second;
                     if (unusedBlob.total() >= targetTotal &&
-                        unusedBlob.total() < bestBlobTotal)
+                        unusedBlob.total() < bestBlobTotal &&
+                        unusedBlob.type() == dtype)
                     {
                         bestBlobPin = hostIt->first;
                         bestBlob = unusedBlob;
@@ -984,14 +979,13 @@ struct BlobManager
         {
             // if dst already has been allocated with total(shape) elements,
             // it won't be recreated and pointer of dst.data remains the same.
-            dst.create(shape, use_half ? CV_16S : CV_32F);
+            dst.create(shape, dtype);
             addHost(lp, dst);
         }
     }
 
     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
-                               std::vector<LayerPin>& pinsForInternalBlobs,
-                               bool use_half = false)
+                               std::vector<LayerPin>& pinsForInternalBlobs)
     {
         CV_TRACE_FUNCTION();
 
@@ -1062,7 +1056,7 @@ struct BlobManager
                         reuse(ld.inputBlobsId[0], blobPin);
                     }
                     else
-                        reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
+                        reuseOrCreate(shapes[index], blobPin, *blobs[index], ld.dtype);
                 }
             }
         }
@@ -1200,6 +1194,7 @@ struct Net::Impl : public detail::NetImplBase
 
         lastLayerId = 0;
         netWasAllocated = false;
+        netWasQuantized = false;
         fusion = true;
         isAsync = false;
         preferableBackend = DNN_BACKEND_DEFAULT;
@@ -1224,6 +1219,7 @@ struct Net::Impl : public detail::NetImplBase
     int lastLayerId;
 
     bool netWasAllocated;
+    bool netWasQuantized;
     bool fusion;
     bool isAsync;
     std::vector<int64> layersTimings;
@@ -1385,7 +1381,7 @@ struct Net::Impl : public detail::NetImplBase
 
             currLayer->unsetAttached();
         }
-
+        netWasAllocated = false;
         layersTimings.clear();
     }
 
@@ -2115,7 +2111,10 @@ struct Net::Impl : public detail::NetImplBase
 
             Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
             CV_Assert(!ieNode.empty());
-            ieNode->net->reset();
+
+            CV_Assert(ieNode->net);
+            InfEngineNgraphNet& ienet = *ieNode->net;
+            ienet.reset();
 
             for (it = layers.begin(); it != layers.end(); ++it)
             {
@@ -2132,16 +2131,26 @@ struct Net::Impl : public detail::NetImplBase
                 {
                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
                     {
-                        InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
-                        dataPtr->setName(ld.name);
+                        auto it = ienet.outputsDesc.find(ld.name);
+                        if (it != ienet.outputsDesc.end())
+                        {
+                            const InferenceEngine::TensorDesc& descriptor = it->second;
+                            InferenceEngine::DataPtr dataPtr = ngraphDataOutputNode(ld.outputBlobsWrappers[i], descriptor, ld.name);
+                            dataPtr->setName(ld.name);
+                        }
+                        else
+                        {
+                            InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
+                            dataPtr->setName(ld.name);
+                        }
                     }
                 }
-                ieNode->net->addBlobs(ld.inputBlobsWrappers);
-                ieNode->net->addBlobs(ld.outputBlobsWrappers);
+                ienet.addBlobs(ld.inputBlobsWrappers);
+                ienet.addBlobs(ld.outputBlobsWrappers);
                 ld.skip = true;
             }
             layers[lastLayerId].skip = false;
-            ieNode->net->init((Target)preferableTarget);
+            ienet.init((Target)preferableTarget);
             return;
         }
 
@@ -2821,10 +2830,11 @@ struct Net::Impl : public detail::NetImplBase
 
         CV_Assert(layerShapesIt != layersShapes.end());
 
+        if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F)
+            ld.dtype = CV_16S;
+
         std::vector<LayerPin> pinsForInternalBlobs;
-        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
-                                          preferableBackend == DNN_BACKEND_OPENCV &&
-                                          preferableTarget == DNN_TARGET_OPENCL_FP16);
+        blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
         for (int i = 0; i < ld.outputBlobs.size(); ++i)
             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
@@ -3428,7 +3438,8 @@ struct Net::Impl : public detail::NetImplBase
             Mat& inp = layers[0].outputBlobs[i];
             CV_Assert(inp.total());
             if (preferableBackend == DNN_BACKEND_OPENCV &&
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
+                preferableTarget == DNN_TARGET_OPENCL_FP16 &&
+                layers[0].dtype == CV_32F)
             {
                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
             }
@@ -3742,6 +3753,25 @@ struct Net::Impl : public detail::NetImplBase
 #endif
     }
 
+    void getQuantizationParams(const Mat& src, std::vector<float>& scales, std::vector<int>& zeropoints)
+    {
+        const int qmin = -128; // INT8_MIN
+        const int qmax = 127;  // INT8_MAX
+
+        double rmin, rmax, sc, zp;
+        cv::minMaxIdx(src, &rmin, &rmax);
+
+        // 0 must be present in the range [rmin, rmax]
+        rmin = std::min(rmin, 0.0);
+        rmax = std::max(rmax, 0.0);
+
+        sc = (rmax == rmin) ? 1.0 : (rmax - rmin)/(qmax - qmin);
+        zp = qmin - (rmin/sc);
+
+        scales.push_back((float)sc);
+        zeropoints.push_back((int)std::round(zp));
+    }
+
     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
     {
         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
@@ -3872,7 +3902,8 @@ struct Net::Impl : public detail::NetImplBase
             Mat& inp = layers[0].outputBlobs[i];
             CV_Assert(inp.total());
             if (preferableBackend == DNN_BACKEND_OPENCV &&
-                preferableTarget == DNN_TARGET_OPENCL_FP16)
+                preferableTarget == DNN_TARGET_OPENCL_FP16 &&
+                layers[0].dtype == CV_32F)
             {
                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
             }
@@ -3898,7 +3929,7 @@ struct Net::Impl : public detail::NetImplBase
                     const MatShape& shape = layersShapes[inputLayerId].out[inputLayerIds[i].oid];
                     layersShapes[layerId].in.push_back(shape);
                 }
-                it->second.layerInstance->updateMemoryShapes(layersShapes[layerId].in);
+                it->second.getLayerInstance()->updateMemoryShapes(layersShapes[layerId].in);
             }
         }
     }
@@ -4303,35 +4334,58 @@ Net::~Net()
 {
 }
 
-int Net::addLayer(const String &name, const String &type, LayerParams &params)
+int Net::addLayer(const String &name, const String &type, const int &dtype, LayerParams &params)
 {
     CV_TRACE_FUNCTION();
 
-    if (impl->getLayerId(name) >= 0)
+    int id = impl->getLayerId(name);
+    if (id >= 0)
     {
-        CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already into net");
-        return -1;
+        if (!DNN_DIAGNOSTICS_RUN || type != "NotImplemented")
+        {
+            CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already into net");
+            return -1;
+        }
+        else
+        {
+            LayerData& ld = impl->layers.find(id)->second;
+            ld.type = type;
+            ld.params = params;
+            return -1;
+        }
     }
 
-    int id = ++impl->lastLayerId;
+    id = ++impl->lastLayerId;
     impl->layerNameToId.insert(std::make_pair(name, id));
-    impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
+    impl->layers.insert(std::make_pair(id, LayerData(id, name, type, dtype, params)));
     if (params.get<bool>("has_dynamic_shapes", false))
         impl->hasDynamicShapes = true;
 
     return id;
 }
 
-int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
+int Net::addLayer(const String &name, const String &type, LayerParams &params)
+{
+    CV_TRACE_FUNCTION();
+    return addLayer(name, type, CV_32F, params);
+}
+
+int Net::addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams &params)
 {
     CV_TRACE_FUNCTION();
 
     int prvLid = impl->lastLayerId;
-    int newLid = this->addLayer(name, type, params);
+    int newLid = this->addLayer(name, type, dtype, params);
     this->connect(prvLid, 0, newLid, 0);
     return newLid;
 }
 
+int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
+{
+    CV_TRACE_FUNCTION();
+    return addLayerToPrev(name, type, CV_32F, params);
+}
+
 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
 {
     CV_TRACE_FUNCTION();
@@ -4442,16 +4496,19 @@ void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
                 ld.outputBlobsWrappers[i]->copyToHost();
             }
         }
-        if (ld.outputBlobs[0].depth() == CV_32F)
+        if (ld.outputBlobs[0].depth() == CV_16S)
         {
-            std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
-            outputvec = ld.outputBlobs;
-        } else {
             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
             outputvec.resize(ld.outputBlobs.size());
             for (int i = 0; i < outputvec.size(); i++)
                 convertFp16(ld.outputBlobs[i], outputvec[i]);
         }
+        else
+        {
+            // Output depth can be CV_32F or CV_8S
+            std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
+            outputvec = ld.outputBlobs;
+        }
     }
     else if (outputBlobs.isUMatVector())
     {
@@ -4504,8 +4561,8 @@ void Net::forward(OutputArrayOfArrays outputBlobs,
         matvec.push_back(impl->getBlob(pins[i]));
     }
 
-    std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
-    outputvec = matvec;
+    outputBlobs.create((int)matvec.size(), 1, CV_32F/*FIXIT*/, -1);  // allocate vector
+    outputBlobs.assign(matvec);
 }
 
 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
@@ -4537,11 +4594,277 @@ void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
     }
 }
 
+Net Net::quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype)
+{
+    CV_TRACE_FUNCTION();
+
+    // Net can be quantized only once.
+    if (impl->netWasQuantized)
+        CV_Error(Error::StsBadArg, "Cannot quantize a quantized net");
+
+    CV_CheckType(inputsDtype, inputsDtype == CV_32F || inputsDtype == CV_8S, "Input depth should be CV_32F or CV_8S");
+    CV_CheckType(outputsDtype, outputsDtype == CV_32F || outputsDtype == CV_8S, "Output depth should be CV_32F or CV_8S");
+
+    bool originalFusion = impl->fusion;
+    int prefBackend = impl->preferableBackend;
+    int prefTarget = impl->preferableTarget;
+
+    // Disable fusions and use CPU backend to quantize net
+    setPreferableBackend(DNN_BACKEND_OPENCV);
+    setPreferableTarget(DNN_TARGET_CPU);
+    enableFusion(false);
+
+    if (calibData.isMat())
+    {
+        setInput(calibData.getMat());
+    }
+    else if (calibData.isMatVector())
+    {
+        std::vector<Mat> calibDataVec;
+        calibData.getMatVector(calibDataVec);
+
+        std::vector<String> inpNames = impl->netInputLayer->outNames;
+        CV_CheckEQ(calibDataVec.size(), inpNames.size(), "Calibration data size should be equal to number of inputs");
+        for (int i = 0; i < calibDataVec.size(); i++)
+            setInput(calibDataVec[i], inpNames[i]);
+    }
+
+    std::vector<String> outNames = getUnconnectedOutLayersNames();
+    std::vector<LayerPin> pins;
+    for (int i = 0; i < outNames.size(); i++)
+        pins.push_back(impl->getPinByAlias(outNames[i]));
+    impl->setUpNet(pins);
+
+    // Compute scales and zeropoints for all the layers
+    std::vector<std::vector<float> > scales;
+    std::vector<std::vector<int> > zeropoints;
+    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++)
+    {
+        LayerData& ld = it->second;
+        if (!ld.skip)
+        {
+            Ptr<Layer> layer = ld.layerInstance;
+            std::vector<Mat> inps(ld.inputBlobs.size());
+            for (int i = 0; i < ld.inputBlobs.size(); ++i)
+                inps[i] = *ld.inputBlobs[i];
+            layer->forward(inps, ld.outputBlobs, ld.internals);
+        }
+
+        std::vector<float> sc;
+        std::vector<int> zp;
+        if (ld.type == "TanH")
+        {
+            sc.push_back(1.f/128);
+            zp.push_back(0);
+        }
+        else if (ld.type == "Sigmoid" || ld.type == "Softmax" || ld.type == "SoftMax")
+        {
+            if (ld.params.get<bool>("log_softmax", false))
+            {
+                sc.push_back(16.f/256);
+                zp.push_back(127);
+            }
+            else
+            {
+                sc.push_back(1.f/256);
+                zp.push_back(-128);
+            }
+        }
+        else if (ld.type == "Split" || ld.type == "Slice" || ld.type == "Crop")
+        {
+            std::vector<float> inp_sc; std::vector<int> inp_zp;
+            impl->getQuantizationParams(*ld.inputBlobs[0], inp_sc, inp_zp);
+            sc.assign(ld.outputBlobs.size(), inp_sc[0]);
+            zp.assign(ld.outputBlobs.size(), inp_zp[0]);
+        }
+        else
+        {
+            for (int i = 0; i < ld.outputBlobs.size(); i++)
+                impl->getQuantizationParams(ld.outputBlobs[i], sc, zp);
+        }
+        scales.push_back(sc);
+        zeropoints.push_back(zp);
+    }
+
+    // For some layers, the input and output scales/zeropoints must be equal so that rescaling of inputs
+    // is not needed during quantized inference. We start from the last layer and modify the layer's input scales/zeropoints
+    // TODO : Need a different approach. Current solution fails when 2 such layers have the same input layer
+    for (Impl::MapIdToLayerData::reverse_iterator it = impl->layers.rbegin(); it != impl->layers.rend(); ++it)
+    {
+        LayerData& ld = it->second;
+        // Layers with multiple outputs. Number of outputs is equal to number of inputs
+        if (ld.type == "Blank" || ld.type == "Dropout" || ld.type == "Identity" || ld.type == "Silence" ||
+            ld.type == "Flatten" || ld.type == "Padding" || ld.type == "Permute" || ld.type == "Reshape" ||
+            ld.type == "ReLU6" || ld.type == "Reorg" || ld.type == "ShuffleChannel" ||
+           (ld.type == "ReLU" && !ld.params.get<float>("negative_slope", 0.f)) /* ReLU with negative slope 0 */)
+        {
+            for (int i = 0; i < ld.outputBlobs.size(); i++)
+            {
+                LayerPin &pin = ld.inputBlobsId[i];
+                scales[pin.lid][pin.oid] = scales[ld.id][i];
+                zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][i];
+            }
+        }
+        // Layers with multiple inputs and single output.
+        else if ((ld.type == "Pooling" && toLowerCase(ld.params.get<String>("pool", "max")) == "max") /* Max Pooling */ ||
+                 (ld.type == "Eltwise" && toLowerCase(ld.params.get<String>("operation", "sum")) == "max") /* Elementwise max */ ||
+                  ld.type == "Concat")
+        {
+            for (int i = 0; i < ld.inputBlobsId.size(); i++)
+            {
+                LayerPin &pin = ld.inputBlobsId[i];
+                scales[pin.lid][pin.oid] = scales[ld.id][0];
+                zeropoints[pin.lid][pin.oid] = zeropoints[ld.id][0];
+            }
+        }
+    }
+
+    // Create a new Net and add quantized layers to it.
+    Net dstNet;
+    dstNet.impl->netWasQuantized = true;
+    dstNet.setInputsNames(impl->netInputLayer->outNames);
+    dstNet.setPreferableBackend(prefBackend);
+    dstNet.setPreferableTarget(prefTarget);
+    dstNet.enableFusion(originalFusion);
+
+    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin(); it != impl->layers.end(); it++)
+    {
+        LayerData ld = it->second;
+        if (ld.id == 0)
+        {
+            LayerData &quantInpLd = dstNet.impl->layers[0];
+            quantInpLd.dtype = inputsDtype;
+            quantInpLd.params.set("scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
+            quantInpLd.params.set("zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
+            continue;
+        }
+
+        std::vector<LayerPin> inpPins = ld.inputBlobsId;
+        // Fill input and output scales/zeropoints for the layer
+        std::vector<std::vector<float> > inp_out_sc(2);
+        std::vector<std::vector<int> > inp_out_zp(2);
+        for (int i = 0; i < inpPins.size(); i++)
+        {
+            LayerPin &pin = inpPins[i];
+            inp_out_sc[0].push_back(scales[pin.lid][pin.oid]);
+            inp_out_zp[0].push_back(zeropoints[pin.lid][pin.oid]);
+        }
+        inp_out_sc[1] = scales[ld.id];
+        inp_out_zp[1] = zeropoints[ld.id];
+
+        // Quantize layer
+        Ptr<Layer> layer = ld.layerInstance;
+        if (layer->tryQuantize(inp_out_sc, inp_out_zp, ld.params))
+        {
+            ld.type += "Int8";
+            ld.dtype = CV_8S;
+        }
+        ld.params.set("scales", DictValue::arrayReal(inp_out_sc[1].data(), inp_out_sc[1].size()));
+        ld.params.set("zeropoints", DictValue::arrayInt(inp_out_zp[1].data(), inp_out_zp[1].size()));
+
+        // Check and add quantize/dequantize node before layer
+        for (int i = 0; i < inpPins.size(); i++)
+        {
+            LayerPin &pin = inpPins[i];
+            LayerData &inpLd = dstNet.impl->getLayerData(impl->getLayerName(pin.lid));
+            pin.lid = inpLd.id;
+            if (inpLd.dtype != ld.dtype)
+            {
+                String layerName = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? cv::format("quantize/%s/%d", inpLd.name.c_str(), pin.oid)
+                                                                                : cv::format("dequantize/%s/%d", inpLd.name.c_str(), pin.oid);
+                // Check if quantize/dequantize node for the input layer already exists
+                if (dstNet.impl->getLayerId(layerName) >= 0)
+                {
+                    pin.lid = dstNet.impl->getLayerId(layerName);
+                    pin.oid = 0;
+                }
+                else
+                {
+                    LayerParams lp;
+                    lp.set("scales", inp_out_sc[0][i]);
+                    lp.set("zeropoints", inp_out_zp[0][i]);
+                    lp.name = layerName;
+                    lp.type = (inpLd.dtype == CV_32F && ld.dtype == CV_8S) ? "Quantize" : "Dequantize";
+                    int newLid = dstNet.addLayer(lp.name, lp.type, ld.dtype, lp);
+                    dstNet.connect(pin.lid, pin.oid, newLid, 0);
+                    pin.lid = newLid; pin.oid = 0;
+                }
+            }
+        }
+
+        // Add quantized layer to Net and connect to its inputs.
+        int newLid = dstNet.addLayer(ld.name, ld.type, ld.dtype, ld.params);
+        for( int i = 0; i < inpPins.size(); i++ )
+            dstNet.connect(inpPins[i].lid, inpPins[i].oid, newLid, i);
+
+        // If the layer is a output layer, add quantize/dequantize node after it based on output's data type.
+        if (ld.requiredOutputs.size() == 0 && ld.dtype != outputsDtype)
+        {
+            LayerParams lp;
+            lp.set("scales", inp_out_sc[1][0]);
+            lp.set("zeropoints", inp_out_zp[1][0]);
+            lp.name = ((ld.dtype == CV_32F && outputsDtype == CV_8S) ? "quantize/" : "dequantize/") + ld.name;
+            lp.type = (ld.dtype == CV_32F && outputsDtype == CV_8S) ? "Quantize" : "Dequantize";
+            dstNet.addLayerToPrev(lp.name, lp.type, outputsDtype, lp);
+        }
+    }
+    // Restore FP32 Net's backend, target and fusion
+    setPreferableBackend(prefBackend);
+    setPreferableTarget(prefTarget);
+    enableFusion(originalFusion);
+    return dstNet;
+}
+
+void Net::getInputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
+{
+    if (!impl->netWasQuantized)
+        CV_Error(Error::StsBadFunc, "Net isn't quantized");
+
+    LayerParams &lp = impl->layers[0].params;
+    DictValue sc = lp.get("scales");
+    DictValue zp = lp.get("zeropoints");
+
+    for (int i = 0; i < sc.size(); i++)
+    {
+        scales.push_back(sc.get<float>(i));
+        zeropoints.push_back(zp.get<int>(i));
+    }
+}
+
+void Net::getOutputDetails(std::vector<float>& scales, std::vector<int>& zeropoints) const
+{
+    if (!impl->netWasQuantized)
+        CV_Error(Error::StsBadFunc, "Net isn't quantized");
+
+    std::vector<int> outLayerIds = getUnconnectedOutLayers();
+    for (auto &lid : outLayerIds)
+    {
+        LayerParams &lp = impl->layers[lid].params;
+        DictValue sc = lp.get("scales");
+        DictValue zp = lp.get("zeropoints");
+
+        for (int i = 0; i < sc.size(); i++)
+        {
+            scales.push_back(sc.get<float>(i));
+            zeropoints.push_back(zp.get<int>(i));
+        }
+    }
+}
+
 void Net::setPreferableBackend(int backendId)
 {
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(backendId);
 
+    if (backendId == DNN_BACKEND_DEFAULT)
+        backendId = (Backend)PARAM_DNN_BACKEND_DEFAULT;
+
+    if (impl->netWasQuantized && backendId != DNN_BACKEND_OPENCV)
+    {
+        CV_LOG_WARNING(NULL, "DNN: Only default backend supports quantized networks");
+        backendId = DNN_BACKEND_OPENCV;
+    }
+
 #ifdef HAVE_INF_ENGINE
     if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
         backendId = getInferenceEngineBackendTypeParam();
@@ -4550,7 +4873,6 @@ void Net::setPreferableBackend(int backendId)
     if( impl->preferableBackend != backendId )
     {
         impl->preferableBackend = backendId;
-        impl->netWasAllocated = false;
         impl->clear();
     }
 }
@@ -4560,6 +4882,13 @@ void Net::setPreferableTarget(int targetId)
     CV_TRACE_FUNCTION();
     CV_TRACE_ARG(targetId);
 
+    if (impl->netWasQuantized && targetId != DNN_TARGET_CPU &&
+        targetId != DNN_TARGET_OPENCL && targetId != DNN_TARGET_OPENCL_FP16)
+    {
+        CV_LOG_WARNING(NULL, "DNN: Only CPU and OpenCL/OpenCL FP16 target is supported by quantized networks");
+        targetId = DNN_TARGET_CPU;
+    }
+
     if( impl->preferableTarget != targetId )
     {
         impl->preferableTarget = targetId;
@@ -4579,7 +4908,6 @@ void Net::setPreferableTarget(int targetId)
                 impl->preferableTarget = DNN_TARGET_OPENCL;
 #endif
         }
-        impl->netWasAllocated = false;
         impl->clear();
     }
 }
@@ -5211,9 +5539,10 @@ void Net::getMemoryConsumption(const int layerId,
 
     ShapesVec inLayerShapes, outLayerShapes;
     getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
+    size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float);
     for(int i = 0; i < outLayerShapes.size(); i++)
     {
-        blobs += total(outLayerShapes[i]) * sizeof(float);
+        blobs += total(outLayerShapes[i]) * elemSize;
     }
 }
 
@@ -5262,7 +5591,7 @@ void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
     std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
 
     getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
-
+    size_t elemSize = (impl->netWasQuantized) ? sizeof(char) : sizeof(float);
     for(int i = 0; i < layerIds.size(); i++)
     {
         int w = 0, b = 0;
@@ -5277,7 +5606,7 @@ void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
 
         for(int j = 0; j < outLayerShapes[i].size(); j++)
         {
-            b += total(outLayerShapes[i][j]) * sizeof(float);
+            b += total(outLayerShapes[i][j]) * elemSize;
         }
 
         weights.push_back(w);
@@ -5297,7 +5626,6 @@ void Net::enableFusion(bool fusion)
     if( impl->fusion != fusion )
     {
         impl->fusion = fusion;
-        impl->netWasAllocated = false;
         impl->clear();
     }
 }
@@ -5478,6 +5806,12 @@ void Layer::getScaleShift(Mat& scale, Mat& shift) const
     shift = Mat();
 }
 
+void Layer::getScaleZeropoint(float& scale, int& zeropoint) const
+{
+    scale = 1.f;
+    zeropoint = 0;
+}
+
 void Layer::unsetAttached()
 {
     setActivation(Ptr<ActivationLayer>());
@@ -5604,6 +5938,12 @@ void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::
     this->forward(inputs, outputs, internals);
 }
 
+bool Layer::tryQuantize(const std::vector<std::vector<float> > &scales,
+                        const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+{
+    return false;
+}
+
 Layer::~Layer() {}
 
 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -5622,7 +5962,7 @@ bool Layer::updateMemoryShapes(const std::vector<MatShape> &inputs)
 }
 //////////////////////////////////////////////////////////////////////////
 
-static Mutex& getLayerFactoryMutex()
+Mutex& getLayerFactoryMutex()
 {
     static Mutex* volatile instance = NULL;
     if (instance == NULL)
diff --git a/modules/dnn/src/dnn_common.hpp b/modules/dnn/src/dnn_common.hpp
index ff8f5e846724..3c68322e098c 100644
--- a/modules/dnn/src/dnn_common.hpp
+++ b/modules/dnn/src/dnn_common.hpp
@@ -5,6 +5,9 @@
 #ifndef __OPENCV_DNN_COMMON_HPP__
 #define __OPENCV_DNN_COMMON_HPP__
 
+#include <unordered_set>
+#include <unordered_map>
+
 #include <opencv2/dnn.hpp>
 
 namespace cv { namespace dnn {
@@ -13,7 +16,57 @@ CV__DNN_INLINE_NS_BEGIN
 Mutex& getInitializationMutex();
 void initializeLayerFactory();
 
+extern bool DNN_DIAGNOSTICS_RUN;
+extern bool DNN_SKIP_REAL_IMPORT;
+
 namespace detail {
+#define CALL_MEMBER_FN(object, ptrToMemFn)  ((object).*(ptrToMemFn))
+
+class NotImplemented : public Layer
+{
+public:
+    static Ptr<Layer> create(const LayerParams &params);
+
+    static void Register();
+    static void unRegister();
+};
+
+template <typename Importer, typename ... Args>
+Net readNet(Args&& ... args)
+{
+    Net net;
+    Importer importer(net, std::forward<Args>(args)...);
+    return net;
+}
+
+template <typename Importer, typename ... Args>
+Net readNetDiagnostic(Args&& ... args)
+{
+    Net maybeDebugNet = readNet<Importer>(std::forward<Args>(args)...);
+    if (DNN_DIAGNOSTICS_RUN && !DNN_SKIP_REAL_IMPORT)
+    {
+        // if we just imported the net in diagnostic mode, disable it and import again
+        enableModelDiagnostics(false);
+        Net releaseNet = readNet<Importer>(std::forward<Args>(args)...);
+        enableModelDiagnostics(true);
+        return releaseNet;
+    }
+    return maybeDebugNet;
+}
+
+class LayerHandler
+{
+public:
+    void addMissing(const std::string& name, const std::string& type);
+    bool contains(const std::string& type) const;
+    void printMissing();
+
+protected:
+    LayerParams getNotImplementedParams(const std::string& name, const std::string& op);
+
+private:
+    std::unordered_map<std::string, std::unordered_set<std::string>> layers;
+};
 
 struct NetImplBase
 {
diff --git a/modules/dnn/src/ie_ngraph.cpp b/modules/dnn/src/ie_ngraph.cpp
index 748403271425..a61766337e30 100644
--- a/modules/dnn/src/ie_ngraph.cpp
+++ b/modules/dnn/src/ie_ngraph.cpp
@@ -657,7 +657,11 @@ void InfEngineNgraphNet::initPlugin(InferenceEngine::CNNNetwork& net)
                 try
                 {
                     InferenceEngine::IExtensionPtr extension =
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
+                        std::make_shared<InferenceEngine::Extension>(libName);
+#else
                         InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(libName);
+#endif
 
                     ie.AddExtension(extension, "CPU");
                     CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << libName);
@@ -788,21 +792,32 @@ void NgraphBackendLayer::forward(InputArrayOfArrays inputs, OutputArrayOfArrays
 }
 
 
-static InferenceEngine::Layout estimateLayout(const Mat& m)
+static InferenceEngine::Layout estimateLayout(int dims)
 {
-    if (m.dims == 4)
+    if (dims == 4)
         return InferenceEngine::Layout::NCHW;
-    else if (m.dims == 3)
+    else if (dims == 3)
         return InferenceEngine::Layout::CHW;
-    else if (m.dims == 2)
+    else if (dims == 2)
         return InferenceEngine::Layout::NC;
-    else if (m.dims == 1)
+    else if (dims == 1)
         return InferenceEngine::Layout::C;
-    else if (m.dims == 5)
+    else if (dims == 5)
         return InferenceEngine::Layout::NCDHW;
     else
         return InferenceEngine::Layout::ANY;
 }
+static inline
+InferenceEngine::Layout estimateLayout(size_t dims)
+{
+    return estimateLayout((int)dims);
+}
+
+static inline
+InferenceEngine::Layout estimateLayout(const Mat& m)
+{
+    return estimateLayout(m.dims);
+}
 
 static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std::string& name = "")
 {
@@ -838,6 +853,7 @@ InferenceEngine::Blob::Ptr wrapToNgraphBlob(const Mat& m, InferenceEngine::Layou
 
 NgraphBackendWrapper::NgraphBackendWrapper(int targetId, const cv::Mat& m)
     : BackendWrapper(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, targetId)
+    , host((Mat*)&m)
 {
     dataPtr = wrapToInfEngineDataNode(m);
     blob = wrapToNgraphBlob(m, estimateLayout(m));
@@ -889,7 +905,11 @@ InferenceEngine::Blob::Ptr copyBlob(const InferenceEngine::Blob::Ptr& blob)
         copy = InferenceEngine::make_shared_blob<uint8_t>(description);
     }
     else
-        CV_Error(Error::StsNotImplemented, "Unsupported blob precision");
+    {
+        std::ostringstream msg;
+        msg << precision;
+        CV_Error_(Error::StsNotImplemented, ("Unsupported blob precision: %s", msg.str().c_str()));
+    }
     copy->allocate();
     return copy;
 }
@@ -902,6 +922,66 @@ InferenceEngine::DataPtr ngraphDataNode(const Ptr<BackendWrapper>& ptr)
     return p->dataPtr;
 }
 
+static
+InferenceEngine::Blob::Ptr reallocateBlob(Mat &m, const InferenceEngine::TensorDesc& description)
+{
+    auto dims = description.getDims();
+    auto layout = estimateLayout(dims.size());
+    MatShape matShape(dims.begin(), dims.end());
+    if (description.getPrecision() == InferenceEngine::Precision::FP32)
+    {
+        m.create(matShape, CV_32FC1);
+        return InferenceEngine::make_shared_blob<float>(
+                {description.getPrecision(), dims, layout}, (float*)m.data);
+    }
+    else if (description.getPrecision() == InferenceEngine::Precision::I32)
+    {
+        m.create(matShape, CV_32SC1);
+        return InferenceEngine::make_shared_blob<int>(
+                {description.getPrecision(), dims, layout}, (int*)m.data);
+    }
+    else if (description.getPrecision() == InferenceEngine::Precision::U8)
+    {
+        m.create(matShape, CV_8UC1);
+        return InferenceEngine::make_shared_blob<uchar>(
+                {description.getPrecision(), dims, layout}, (uchar*)m.data);
+    }
+    std::ostringstream msg;
+    msg << "Unsupported IE precision: " << description.getPrecision();
+    CV_Error(Error::StsNotImplemented, msg.str());
+}
+
+InferenceEngine::DataPtr ngraphDataOutputNode(
+        const Ptr<BackendWrapper>& ptr,
+        const InferenceEngine::TensorDesc& description,
+        const std::string name)
+{
+    CV_Assert(!ptr.empty());
+    Ptr<NgraphBackendWrapper> p = ptr.dynamicCast<NgraphBackendWrapper>();
+    CV_Assert(!p.empty());
+    NgraphBackendWrapper& w = *p;
+    const InferenceEngine::TensorDesc& blobDesc = w.blob.get()->getTensorDesc();
+    auto dims = description.getDims();
+    bool reallocate = false;
+    if (blobDesc.getPrecision() != description.getPrecision())
+    {
+        reallocate = true;
+        CV_LOG_WARNING(NULL, "Reallocate output '" << name << "' blob due to wrong precision: " << blobDesc.getPrecision() << " => " << description.getPrecision() << "  ndims=" << dims.size());
+    }
+    if (dims.size() != blobDesc.getDims().size())
+    {
+        reallocate = true;
+        CV_LOG_WARNING(NULL, "Reallocate output '" << name << "' blob due to wrong dims: " << blobDesc.getDims().size() << " => " << dims.size());
+    }
+    if (reallocate)
+    {
+        auto layout = estimateLayout(dims.size());
+        w.dataPtr = InferenceEngine::DataPtr(new InferenceEngine::Data(name,
+               {description.getPrecision(), dims, layout}));
+        w.blob = reallocateBlob(*w.host, description);
+    }
+    return w.dataPtr;
+}
 
 void forwardNgraph(const std::vector<Ptr<BackendWrapper> >& outBlobsWrappers,
                       Ptr<BackendNode>& node, bool isAsync)
@@ -917,6 +997,13 @@ void InfEngineNgraphNet::reset()
     allBlobs.clear();
     infRequests.clear();
     isInit = false;
+
+    outputsDesc.clear();
+    for (const auto& it : cnn.getOutputsInfo())
+    {
+        const std::string& name = it.first;
+        outputsDesc.insert({name, it.second->getTensorDesc()});
+    }
 }
 
 void InfEngineNgraphNet::addBlobs(const std::vector<cv::Ptr<BackendWrapper> >& ptrs)
@@ -1005,35 +1092,54 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
         reqWrapper->req.SetInput(inpBlobs);
         reqWrapper->req.SetOutput(outBlobs);
 
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
+        InferenceEngine::InferRequest infRequest = reqWrapper->req;
+        NgraphReqWrapper* wrapperPtr = reqWrapper.get();
+        CV_Assert(wrapperPtr && "Internal error");
+#else
         InferenceEngine::IInferRequest::Ptr infRequestPtr = reqWrapper->req;
-        infRequestPtr->SetUserData(reqWrapper.get(), 0);
+        CV_Assert(infRequestPtr);
+        InferenceEngine::IInferRequest& infRequest = *infRequestPtr.get();
+        infRequest.SetUserData(reqWrapper.get(), 0);
+#endif
 
-        infRequestPtr->SetCompletionCallback(
-            [](InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode status)
+#if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
+        // do NOT capture 'reqWrapper' (smart ptr) in the lambda callback
+        infRequest.SetCompletionCallback<std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>>(
+            [wrapperPtr](InferenceEngine::InferRequest /*request*/, InferenceEngine::StatusCode status)
+#else
+        infRequest.SetCompletionCallback(
+            [](InferenceEngine::IInferRequest::Ptr requestPtr, InferenceEngine::StatusCode status)
+#endif
             {
                 CV_LOG_DEBUG(NULL, "DNN(nGraph): completionCallback(" << (int)status << ")");
+#if !INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2021_4)
+                CV_Assert(requestPtr);
+                InferenceEngine::IInferRequest& request = *requestPtr.get();
 
-                NgraphReqWrapper* wrapper;
-                request->GetUserData((void**)&wrapper, 0);
-                CV_Assert(wrapper && "Internal error");
+                NgraphReqWrapper* wrapperPtr;
+                request.GetUserData((void**)&wrapperPtr, 0);
+                CV_Assert(wrapperPtr && "Internal error");
+#endif
+                NgraphReqWrapper& wrapper = *wrapperPtr;
 
                 size_t processedOutputs = 0;
                 try
                 {
-                    for (; processedOutputs < wrapper->outProms.size(); ++processedOutputs)
+                    for (; processedOutputs < wrapper.outProms.size(); ++processedOutputs)
                     {
-                        const std::string& name = wrapper->outsNames[processedOutputs];
-                        Mat m = ngraphBlobToMat(wrapper->req.GetBlob(name));
+                        const std::string& name = wrapper.outsNames[processedOutputs];
+                        Mat m = ngraphBlobToMat(wrapper.req.GetBlob(name));
 
                         try
                         {
                             CV_Assert(status == InferenceEngine::StatusCode::OK);
-                            wrapper->outProms[processedOutputs].setValue(m.clone());
+                            wrapper.outProms[processedOutputs].setValue(m.clone());
                         }
                         catch (...)
                         {
                             try {
-                                wrapper->outProms[processedOutputs].setException(std::current_exception());
+                                wrapper.outProms[processedOutputs].setException(std::current_exception());
                             } catch(...) {
                                 CV_LOG_ERROR(NULL, "DNN: Exception occurred during async inference exception propagation");
                             }
@@ -1043,16 +1149,16 @@ void InfEngineNgraphNet::forward(const std::vector<Ptr<BackendWrapper> >& outBlo
                 catch (...)
                 {
                     std::exception_ptr e = std::current_exception();
-                    for (; processedOutputs < wrapper->outProms.size(); ++processedOutputs)
+                    for (; processedOutputs < wrapper.outProms.size(); ++processedOutputs)
                     {
                         try {
-                            wrapper->outProms[processedOutputs].setException(e);
+                            wrapper.outProms[processedOutputs].setException(e);
                         } catch(...) {
                             CV_LOG_ERROR(NULL, "DNN: Exception occurred during async inference exception propagation");
                         }
                     }
                 }
-                wrapper->isReady = true;
+                wrapper.isReady = true;
             }
         );
     }
diff --git a/modules/dnn/src/ie_ngraph.hpp b/modules/dnn/src/ie_ngraph.hpp
index 7a8c4bef8d5c..617f1d454232 100644
--- a/modules/dnn/src/ie_ngraph.hpp
+++ b/modules/dnn/src/ie_ngraph.hpp
@@ -54,7 +54,8 @@ class InfEngineNgraphNet
     void setNodePtr(std::shared_ptr<ngraph::Node>* ptr);
 
     void reset();
-private:
+
+//private:
     detail::NetImplBase& netImpl_;
 
     void release();
@@ -89,6 +90,8 @@ class InfEngineNgraphNet
     bool hasNetOwner;
     std::vector<std::string> requestedOutputs;
     std::unordered_set<std::shared_ptr<ngraph::Node>> unconnectedNodes;
+
+    std::map<std::string, InferenceEngine::TensorDesc> outputsDesc;
 };
 
 class InfEngineNgraphNode : public BackendNode
@@ -121,12 +124,17 @@ class NgraphBackendWrapper : public BackendWrapper
     virtual void copyToHost() CV_OVERRIDE;
     virtual void setHostDirty() CV_OVERRIDE;
 
+    Mat* host;
     InferenceEngine::DataPtr dataPtr;
     InferenceEngine::Blob::Ptr blob;
     AsyncArray futureMat;
 };
 
 InferenceEngine::DataPtr ngraphDataNode(const Ptr<BackendWrapper>& ptr);
+InferenceEngine::DataPtr ngraphDataOutputNode(
+        const Ptr<BackendWrapper>& ptr,
+        const InferenceEngine::TensorDesc& description,
+        const std::string name);
 
 // This is a fake class to run networks from Model Optimizer. Objects of that
 // class simulate responses of layers are imported by OpenCV and supported by
diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp
index 698168817f5f..9d8a3783a2e9 100644
--- a/modules/dnn/src/init.cpp
+++ b/modules/dnn/src/init.cpp
@@ -139,6 +139,46 @@ void initializeLayerFactory()
     CV_DNN_REGISTER_LAYER_CLASS(FlowWarp,       FlowWarpLayer);
 
     CV_DNN_REGISTER_LAYER_CLASS(LSTM,           LSTMLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(GRU,            GRULayer);
+    CV_DNN_REGISTER_LAYER_CLASS(CumSum,         CumSumLayer);
+
+    CV_DNN_REGISTER_LAYER_CLASS(Quantize,         QuantizeLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(Dequantize,       DequantizeLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ConvolutionInt8,  ConvolutionLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(InnerProductInt8, InnerProductLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(PoolingInt8,      PoolingLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(EltwiseInt8,      EltwiseLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(BatchNormInt8,    BatchNormLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(ScaleInt8,        ScaleLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(ShiftInt8,        ShiftLayerInt8);
+
+    CV_DNN_REGISTER_LAYER_CLASS(ReLUInt8,         ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(ReLU6Int8,        ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(SigmoidInt8,      ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(TanHInt8,         ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(SwishInt8,        ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(MishInt8,         ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(ELUInt8,          ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(BNLLInt8,         ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(AbsValInt8,       ActivationLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(SoftmaxInt8,      SoftmaxLayerInt8);
+    CV_DNN_REGISTER_LAYER_CLASS(SoftMaxInt8,      SoftmaxLayerInt8);
+
+    CV_DNN_REGISTER_LAYER_CLASS(ConcatInt8,       ConcatLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(FlattenInt8,      FlattenLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(PaddingInt8,      PaddingLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(BlankInt8,        BlankLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(DropoutInt8,      BlankLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(IdentityInt8,     BlankLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(SilenceInt8,      BlankLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ConstInt8,        ConstLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ReshapeInt8,      ReshapeLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(SplitInt8,        SplitLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(SliceInt8,        SliceLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(CropInt8,         CropLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(PermuteInt8,      PermuteLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ReorgInt8,        ReorgLayer);
+    CV_DNN_REGISTER_LAYER_CLASS(ShuffleChannelInt8, ShuffleChannelLayer);
 }
 
 CV__DNN_INLINE_NS_END
diff --git a/modules/dnn/src/int8layers/batch_norm_layer.cpp b/modules/dnn/src/int8layers/batch_norm_layer.cpp
new file mode 100644
index 000000000000..c5b8c3d9e9b0
--- /dev/null
+++ b/modules/dnn/src/int8layers/batch_norm_layer.cpp
@@ -0,0 +1,178 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv
+{
+namespace dnn
+{
+
+class BatchNormLayerInt8Impl CV_FINAL : public BatchNormLayerInt8
+{
+public:
+    Mat origin_weights, origin_bias;
+    Mat weights_, bias_;
+    mutable int dims;
+
+    BatchNormLayerInt8Impl(const LayerParams& params)
+        : dims(-1)
+    {
+        setParamsFrom(params);
+        useGlobalStats = params.get<bool>("use_global_stats", true);
+        input_sc = params.get<float>("input_scale");
+        input_zp = params.get<int>("input_zeropoint");
+        output_sc = params.get<float>("scales");
+        output_zp = params.get<int>("zeropoints");
+
+        CV_Assert(blobs.size() == 2);
+        size_t n = blobs[0].total();
+        CV_Assert(blobs[1].total() == n &&
+                  blobs[0].isContinuous() && blobs[1].isContinuous() &&
+                  blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
+
+        origin_weights = blobs[0];
+        origin_bias = blobs[1];
+    }
+
+    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
+    {
+        origin_weights.convertTo(weights_, CV_32F, input_sc/output_sc);
+        addWeighted(origin_bias, 1.0/output_sc, weights_, -input_zp, output_zp, bias_, CV_32F);
+    }
+
+    void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
+    {
+        scale = origin_weights;
+        shift = origin_bias;
+    }
+
+    void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE
+    {
+        scale = output_sc;
+        zeropoint = output_zp;
+    }
+
+    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
+    {
+        Mat w_, b_;
+        top->getScaleShift(w_, b_);
+        if (w_.empty() && b_.empty())
+            return false;
+
+        const int numChannels = weights_.total();
+        const int numFusedWeights = w_.total();
+        const int numFusedBias = b_.total();
+
+        if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w_.empty()) ||
+            (numFusedBias != numChannels && numFusedBias != 1 && !b_.empty()))
+            return false;
+
+        float new_sc;
+        int new_zp;
+        top->getScaleZeropoint(new_sc, new_zp);
+
+        Mat w = numFusedWeights == 1 ? Mat(1, numChannels, CV_32F, Scalar(w_.at<float>(0))) :
+                (w_.empty() ? Mat::ones(1, numChannels, CV_32F) : w_.reshape(1, 1));
+
+        Mat b = numFusedBias == 1 ? Mat(1, numChannels, CV_32F, Scalar(b_.at<float>(0))) :
+                (b_.empty() ? Mat::zeros(1, numChannels, CV_32F) : b_.reshape(1, 1));
+
+        weights_ = Mat(); bias_ = Mat();
+        multiply(origin_weights, w, weights_, input_sc/new_sc, CV_32F);
+        multiply(origin_bias, w, bias_);
+        add(bias_, b, bias_);
+        addWeighted(bias_, 1.0/new_sc, weights_, -input_zp, new_zp, bias_, CV_32F);
+        return true;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        dims = inputs[0].size();
+        if (!useGlobalStats && inputs[0][0] != 1)
+            CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        return true;
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            return activ_int8->blobs.empty();
+        }
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        CV_Assert(blobs.size() == 2);
+        CV_Assert(inputs.size() == 1);
+
+        Mat &inpBlob = inputs[0];
+        int planeSize = 1;
+        for (size_t i = 2; i < inpBlob.dims; i++) {
+            planeSize *= inpBlob.size[i];
+        }
+
+        for (size_t ii = 0; ii < outputs.size(); ii++)
+        {
+            Mat &outBlob = outputs[ii];
+
+            for(int num = 0; num < outBlob.size[0]; num++)
+            {
+                for (int n = 0; n < outBlob.size[1]; n++)
+                {
+                    float w = weights_.at<float>(n);
+                    float b = bias_.at<float>(n);
+                    Mat inpBlobPlane(1, planeSize, CV_8S, inpBlob.ptr<int8_t>(num, n));
+                    Mat outBlobPlane(1, planeSize, CV_8S, outBlob.ptr<int8_t>(num, n));
+                    inpBlobPlane.convertTo(outBlobPlane, CV_8S, w, b);
+                }
+            }
+        }
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(outputs); // suppress unused variable warning
+
+        int64 flops = 0;
+        for(int i = 0; i < inputs.size(); i++)
+        {
+            flops += 3*total(inputs[i]);
+        }
+        return flops;
+    }
+
+private:
+    bool useGlobalStats;
+};
+
+Ptr<BatchNormLayerInt8> BatchNormLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<BatchNormLayerInt8>(new BatchNormLayerInt8Impl(params));
+}
+
+}  // namespace dnn
+}  // namespace cv
diff --git a/modules/dnn/src/int8layers/convolution_layer.cpp b/modules/dnn/src/int8layers/convolution_layer.cpp
new file mode 100644
index 000000000000..05749885c05b
--- /dev/null
+++ b/modules/dnn/src/int8layers/convolution_layer.cpp
@@ -0,0 +1,1136 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+#include <opencv2/core/utils/logger.hpp>
+
+#include "opencv2/core/hal/hal.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+#include <iostream>
+#include <numeric>
+
+namespace cv
+{
+namespace dnn
+{
+
+#if CV_SIMD
+static inline void v_expand_mul_add(const v_int8x16& a, const v_int8x16& b,
+                                    v_int32x4& out0, v_int32x4& out1, v_int32x4& out2, v_int32x4& out3)
+{
+    v_int16x8 a0, a1, b0, b1;
+    v_expand(a, a0, a1);
+    v_expand(b, b0, b1);
+
+    v_int32x4 t0, t1;
+    v_mul_expand(a0, b0, t0, t1);
+    out0 += t0; out1 += t1;
+
+    v_mul_expand(a1, b1, t0, t1);
+    out2 += t0; out3 += t1;
+}
+#endif
+
+class BaseConvolutionLayerInt8Impl : public ConvolutionLayerInt8
+{
+public:
+    BaseConvolutionLayerInt8Impl(const LayerParams &params)
+    {
+        setParamsFrom(params);
+        getConvolutionKernelParams(params, kernel_size, pads_begin, pads_end, strides, dilations, padMode, adjust_pads);
+
+        numOutput = params.get<int>("num_output");
+        int ngroups = params.get<int>("group", 1);
+        CV_Assert(numOutput % ngroups == 0);
+
+        input_zp = params.get<int>("input_zeropoint");
+        output_zp = params.get<int>("zeropoints");
+        output_sc = params.get<float>("scales");
+
+        if (kernel_size.size() == 2) {
+            kernel = Size(kernel_size[1], kernel_size[0]);
+            stride = Size(strides[1], strides[0]);
+            for (int i = 0; i < pads_begin.size(); i++) {
+                if (pads_begin[i] != pads_end[i])
+                    CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
+            }
+            pad = Size(pads_begin[1], pads_begin[0]);
+            dilation = Size(dilations[1], dilations[0]);
+
+            adjustPad.height = adjust_pads[0];
+            adjustPad.width = adjust_pads[1];
+        }
+
+        for (int i = 0; i < adjust_pads.size(); i++) {
+            CV_Assert(adjust_pads[i] < strides[i]);
+        }
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        // blobs[0] - Weights (INT8)
+        // blobs[1] - Biases (INT32)
+        // blobs[2] - Multipliers for convolution output stage (FP32)
+        CV_Assert(!inputs.empty() && blobs.size() == 3);
+        MatSize weightShape = blobs[0].size;
+
+        CV_Assert(inputs[0].dims == outputs[0].dims);
+        if (weightShape.dims() == 3)
+        {
+            kernel_size.assign(1, kernel_size[0]);
+            strides.assign(1, strides[0]);
+            dilations.assign(1, dilations[0]);
+            pads_begin.assign(1, pads_begin[0]);
+            pads_end.assign(1, pads_end[0]);
+        }
+        CV_Assert(weightShape.dims() == kernel_size.size() + 2);
+        for (int i = 0; i < kernel_size.size(); i++) {
+            CV_Assert(weightShape[i + 2] == kernel_size[i]);
+        }
+
+        const Mat &input = inputs[0];
+        CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || input.dims == 4 || input.dims == 5) && input.type() == CV_8S);
+        for (size_t i = 0; i < outputs.size(); i++)
+        {
+            CV_Assert(inputs[i].type() == input.type());
+            CV_Assert(((input.dims == 3 && kernel_size.size() == 1) || inputs[i].dims == 4 || inputs[i].dims == 5) && inputs[i].size[1] == input.size[1]);
+            for (int j = 0; j < inputs[i].dims; j++) {
+                CV_Assert(inputs[i].size[j] == input.size[j]);
+            }
+        }
+
+        std::vector<int> inpShape;
+        std::vector<int> outShape;
+        for (int i = 2; i < inputs[0].dims; i++) {
+            inpShape.push_back(inputs[0].size[i]);
+            outShape.push_back(outputs[0].size[i]);
+        }
+        getConvPoolPaddings(inpShape, kernel_size, strides, padMode, pads_begin, pads_end);
+        if (pads_begin.size() == 2) {
+            for (int i = 0; i < pads_begin.size(); i++) {
+                if (pads_begin[i] != pads_end[i])
+                    CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding in convolution layer");
+            }
+            pad = Size(pads_begin[1], pads_begin[0]);
+        }
+    }
+
+    virtual MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const = 0;
+    bool is1x1() const
+    {
+        return (kernel.height == 1 && kernel.width == 1) &&
+               (stride.height == 1 && stride.width == 1) &&
+               (dilation.height == 1 && dilation.width == 1);
+    }
+
+    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
+    {
+        Mat w, b;
+        top->getScaleShift(w, b);
+        if (w.empty() && b.empty())
+            return false;
+
+        CV_Assert((w.empty() || w.type() == CV_32F) &&
+                  (b.empty() || b.type() == CV_32F));
+
+        float new_sc;
+        int new_zp;
+        top->getScaleZeropoint(new_sc, new_zp);
+        fuseWeights(w, b, new_sc);
+        output_sc = new_sc;
+        output_zp = new_zp;
+        return true;
+    }
+
+    virtual void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) = 0;
+};
+
+//TODO: simultaneously convolution and bias addition for cache optimization
+class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
+{
+public:
+    enum { VEC_ALIGN = 32, DFT_TYPE = CV_8S };
+    Mat weightsMat;
+    std::vector<int> biasvec;
+    Mat outputMultiplier;
+    Mat activationLUT;
+    Ptr<ActivationLayerInt8> activ;
+
+    ConvolutionLayerInt8Impl(const LayerParams &params) : BaseConvolutionLayerInt8Impl(params){}
+
+    MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
+    {
+        CV_Assert(!blobs.empty());
+        int dims = inpShape.size();
+        int inpD = dims == 5 ? inpShape[2] : 1;
+        int inpH = inpShape[dims - 2];
+        int inpW = inpShape.back();
+        int inpGroupCn = blobs[0].size[1];
+        int ksize = inpGroupCn * std::accumulate(kernel_size.begin(), kernel_size.end(),
+                                                 1, std::multiplies<size_t>());
+        return shape(inpD * inpH * inpW, ksize);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        size_t ksize = kernel_size.size();
+        // Only default backend and Conv1D/Conv2D/Conv3D are supported
+        return backendId == DNN_BACKEND_OPENCV && ksize >= 1 && ksize <= 3;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(!blobs.empty());
+        const int* weightShape = blobs[0].size.p;
+        CV_Assert(blobs[1].total() == (size_t)weightShape[0]);
+
+        internals.clear();
+
+        CV_Assert(inputs.size() != 0);
+        std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
+
+        int outCn = weightShape[0];
+        std::vector<int> outShape;
+        outShape.push_back(inputs[0][0]);
+        outShape.push_back(outCn);
+
+        int inpCn = inputs[0][1];
+        if (padMode.empty())
+        {
+            for (int i = 0; i < inpShape.size(); i++)
+                outShape.push_back((inpShape[i] + pads_begin[i] + pads_end[i] - dilations[i] * (kernel_size[i] - 1) - 1) / strides[i] + 1);
+        }
+        else
+        {
+            getConvPoolOutParams(inpShape, kernel_size, strides, padMode, dilations, outShape);
+        }
+
+        int ngroups = inpCn / weightShape[1];
+        if (ngroups == 0 || ngroups * weightShape[1] != inpCn)
+            CV_Error(Error::StsError, format("Number of input channels should "
+                     "be multiple of %d but got %d", weightShape[1], inpCn));
+        CV_Assert(ngroups > 0 && inpCn % ngroups == 0 && outCn % ngroups == 0);
+
+        outputs.resize(1, outShape);
+
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        BaseConvolutionLayerInt8Impl::finalize(inputs_arr, outputs_arr);
+
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+        // prepare weightsMat where each row is aligned and has enough zero padding on the right to
+        // use vectorized (i.e. with intrinsics) loops without tail processing
+        Mat wm = blobs[0].reshape(1, numOutput);
+        if( wm.step1() % VEC_ALIGN != 0 )
+        {
+            int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
+            Mat wm_buffer = Mat(numOutput, newcols, wm.type());
+            Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
+            wm_padding.setTo(Scalar::all(0));
+            Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
+            wm.copyTo(wm_aligned);
+            wm = wm_aligned;
+        }
+        weightsMat = wm;
+
+        Mat biasMat = blobs[1];
+        biasvec.resize(numOutput+2);
+        for(int i = 0; i < numOutput; i++ )
+            biasvec[i] = biasMat.at<int>(i);
+
+        outputMultiplier = blobs[2];
+    }
+
+    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            activ = activ_int8;
+            if (!activ_int8->blobs.empty())
+                activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
+            return true;
+        }
+        return false;
+    }
+
+    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
+    {
+        return BaseConvolutionLayerInt8Impl::tryFuse(top);
+    }
+
+    void fuseWeights(const Mat& w_, const Mat& b_, const float& new_sc) CV_OVERRIDE
+    {
+        const int outCn = weightsMat.size[0];
+        Mat w = w_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(w_.at<float>(0))) : w_;
+        Mat b = b_.total() == 1 ? Mat(1, outCn, CV_32F, Scalar(b_.at<float>(0))) : b_;
+        CV_Assert_N(!weightsMat.empty(), biasvec.size() == outCn + 2,
+                    w.empty() || outCn == w.total(), b.empty() || outCn == b.total());
+
+        for (int i = 0; i < outCn; ++i)
+        {
+            float off = outputMultiplier.at<float>(i) * output_sc;
+            if (!w.empty())
+                off *= w.at<float>(i);
+
+            if (!b.empty())
+                biasvec[i] += (int)std::round(b.at<float>(i)/off);
+
+            outputMultiplier.at<float>(i) = off/new_sc;
+        }
+        biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
+    }
+
+    class ParallelConv : public cv::ParallelLoopBody
+    {
+    public:
+        enum { BLK_SIZE = 32, BLK_SIZE_CN = 64 };
+
+        const Mat* input_;
+        const Mat* weights_;
+        Mat* output_;
+        int outShape[4]; // used only for conv2d
+        std::vector<size_t> kernel_size, pads_begin, pads_end, strides, dilations;
+        int ngroups_, nstripes_;
+        std::vector<int> ofstab_;
+        const std::vector<int>* biasvec_;
+        const Mat* activLUT_;
+        const ActivationLayerInt8* activ_;
+        bool is1x1_;
+        bool useAVX2;
+        bool useAVX512;
+        int blk_size_cn;
+        int inpZp, outZp;
+        const float* multiplier;
+
+        ParallelConv()
+            : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
+              biasvec_(0), activLUT_(0), activ_(0), is1x1_(false), useAVX2(false), useAVX512(false)
+            , blk_size_cn(0), inpZp(0), outZp(0), multiplier(0)
+        {}
+
+        static void run( const Mat& input, Mat& output, const Mat& weights, const Mat& multipliers,
+                         const std::vector<int>& biasvec, const Mat& activLUT,
+                         const std::vector<size_t>& kernel_size, const std::vector<size_t>& strides,
+                         const std::vector<size_t>& pads_begin, const std::vector<size_t>& pads_end,
+                         const std::vector<size_t>& dilations,
+                         const ActivationLayerInt8* activ, int ngroups, int nstripes, int inp_Zp, int out_Zp)
+        {
+            size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(),
+                                           1, std::multiplies<size_t>());
+            bool isConv1D = input.dims == 3;
+            bool isConv2D = input.dims == 4;
+            bool isConv3D = input.dims == 5;
+            CV_CheckEQ(static_cast<int>(kernel_size.size()), input.dims - 2, "");
+            CV_Assert_N(input.dims == output.dims,
+                       input.size[0] == output.size[0],
+                       weights.rows == output.size[1],
+                       weights.cols == (input.size[1]/ngroups)*karea,
+                       input.type() == CV_8SC1,
+                       output.type() == CV_32SC1,
+                       input.type() == weights.type(),
+                       input.isContinuous(),
+                       output.isContinuous(),
+                       biasvec.size() == (size_t)output.size[1]+2);
+            CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, "");
+            ParallelConv p;
+
+            p.input_ = &input;
+            p.weights_ = &weights;
+            p.output_ = &output;
+            int max_ind = isConv1D? 3: 4;
+            for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i];
+            p.outShape[1] /= ngroups;
+
+            p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations;
+            p.pads_begin = pads_begin; p.pads_end = pads_end;
+
+            p.ngroups_ = ngroups;
+            p.nstripes_ = nstripes;
+
+            int inpCnAll = input.size[1];
+            int depth = (input.dims == 5) ? input.size[2] : 1;
+            int width = input.size[input.dims - 1];
+            int height = isConv1D? 1 : input.size[input.dims - 2];
+            int inpCn = inpCnAll / ngroups;
+
+            p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 &&
+                       pads_begin[0] == 0  && pads_begin[1] == 0) ||
+                       (isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1);
+
+            p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
+            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;
+
+            int kernel_d = isConv3D? kernel_size[0] : 1;
+            int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
+            int kernel_w = kernel_size.back();
+
+            int blk_size_cn0 = cvCeil(1600./(kernel_w*kernel_h));
+            int ncn = 32;
+            while (ncn*2 < blk_size_cn0 && ncn < inpCn)
+                ncn *= 2;
+            ncn = std::min(ncn, inpCn);
+            p.blk_size_cn = ncn;
+
+            int dil_d = isConv3D? dilations[0] : 1;
+            int dil_h = isConv1D? 1 : dilations[dilations.size() - 2];
+            int dil_w = dilations.back();
+
+            p.inpZp = inp_Zp;
+            p.outZp = out_Zp;
+            p.multiplier = multipliers.ptr<float>(0);
+
+            p.ofstab_.resize(karea * ncn);
+            int* ofstab = &p.ofstab_[0];
+
+            if (isConv1D)
+            {
+                for( int k = 0; k < ncn; k++ )
+                    for( int k_c = 0; k_c < kernel_w; k_c++ )
+                        ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w;
+            }
+            else if (isConv2D)
+            {
+                for( int k = 0; k < ncn; k++ )
+                    for( int k_r = 0; k_r < kernel_h; k_r++ )
+                        for( int k_c = 0; k_c < kernel_w; k_c++ )
+                            ofstab[(k*kernel_h + k_r)*kernel_w + k_c] =
+                                   (k*height + k_r*dil_h)*width + k_c*dil_w;
+            }
+            else
+            {
+                for( int k = 0; k < ncn; k++ )
+                    for (int k_d = 0; k_d < kernel_d; k_d++)
+                        for( int k_r = 0; k_r < kernel_h; k_r++ )
+                            for( int k_c = 0; k_c < kernel_w; k_c++ )
+                                ofstab[(k*kernel_d*kernel_h + k_d*kernel_h + k_r)*kernel_w + k_c] =
+                                       (k*depth*height + k_d*dil_d*height + k_r*dil_h)*width + k_c*dil_w;
+            }
+
+            p.biasvec_ = &biasvec;
+            p.activLUT_ = &activLUT;
+            p.activ_ = !activLUT.empty() ? activ : 0;
+
+            parallel_for_(Range(0, nstripes), p, nstripes);
+        }
+
+        virtual void operator ()(const Range &r0) const CV_OVERRIDE
+        {
+            const int valign = ConvolutionLayerInt8Impl::VEC_ALIGN;
+            int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
+            bool isConv1D = input_->dims == 3;
+            bool isConv2D = input_->dims == 4;
+            bool isConv3D = input_->dims == 5;
+
+            int outW = output_->size[output_->dims - 1];
+            int outH = isConv1D? 1 : output_->size[output_->dims - 2];
+            int outCn = output_->size[1]/ngroups;
+
+            int depth = isConv3D? input_->size[2] : 1;
+            int height = isConv1D? 1 : input_->size[input_->dims - 2];
+            int width = input_->size[input_->dims - 1];
+            int inpCn = input_->size[1]/ngroups;
+
+            const int nstripes = nstripes_;
+
+            int kernel_d = isConv3D? kernel_size[0] : 1;
+            int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
+            int kernel_w = kernel_size.back();
+            int karea = kernel_w*kernel_h*kernel_d;
+
+            int pad_d = isConv3D? pads_begin[0] : 0;
+            int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2];
+            int pad_l = pads_begin.back();
+
+            int stride_d = isConv3D? strides[0] : 0;
+            int stride_h = isConv1D? 0 : strides[strides.size() - 2];
+            int stride_w = strides.back();
+
+            int dilation_d = isConv3D? dilations[0] : 1;
+            int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2];
+            int dilation_w = dilations.back();
+
+            int i, j, k, d;
+            int inpPlaneSize = (int)input_->total(2);
+            int outPlaneSize = (int)output_->total(2);
+            bool is1x1 = is1x1_;
+
+            int stripesPerSample;
+            int stripeSize;
+            Range r = r0;
+            bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
+                outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
+                width >= 16 + dilation_w*(kernel_w - 1);
+            // for now only 3x3 depth-wise convolutions are supported
+            depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
+                // computing at most 1 pixel from each side can involve padding
+                max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
+                pad_l <= 1 && pad_t <= 1;
+
+            if( !depthWiseConvolution && nstripes >= batchSize*2 )
+            {
+                stripesPerSample = nstripes/batchSize;
+                stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, 8);
+                stripeSize = std::min(stripeSize, outPlaneSize);
+            }
+            else
+            {
+                stripesPerSample = 1;
+                int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1);
+                r.start *= samplesPerStripe;
+                r.end *= samplesPerStripe;
+                stripeSize = outPlaneSize;
+            }
+
+            const int8_t* data_inp0_ = input_->ptr<int8_t>();
+            const int* ofstab = &ofstab_[0];
+            const int8_t* wptr_orig_ = weights_->ptr<int8_t>();
+            size_t wstep = weights_->step1();
+            const int* biasptr_ = &biasvec_->at(0);
+            const int* lutptr_ = !activLUT_->empty() ? activLUT_->ptr<int>() : 0;
+            int* data_out0_ = output_->ptr<int>();
+            AutoBuffer<int8_t> rowbuf0_;
+            int8_t* rowbuf0 = 0;
+            bool use_rowbuf = !depthWiseConvolution;
+            int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);
+
+            // im2row buffer is not used for depth-wise convolution
+            if(use_rowbuf)
+            {
+                size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
+                //printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
+                rowbuf0_.allocate(rowbufsz + valign);
+                rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(int8_t)));
+                // we clear the buffer once; ultimately, it lets us to avoid
+                // tail processing after running the unrolled/vectorized loop.
+                // the main idea is to make sure that the tail (a.k.a. padding) of each row
+                // (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
+                // does not contain NaNs or Infs. Because the padding in the weights
+                // matrix is explicitly initialized with 0's, we handle all other
+                // cases nicely, i.e. we can skip expliciting re-initialization
+                // of the padding - we just retain elements from the previous iteration
+                // of the loop over channels (cn0).
+                memset(rowbuf0, (int8_t)inpZp, rowbufsz*sizeof(rowbuf0[0]) );
+            }
+
+            for( int stripe = r.start; stripe < r.end; stripe++ )
+            {
+                int subsampleIdx = stripe/stripesPerSample;
+                if( subsampleIdx >= batchSize )
+                    break;
+                int stripeStart = (int)((stripe - subsampleIdx*stripesPerSample)*stripeSize);
+                int stripeEnd = (int)std::min(stripeStart + stripeSize, outPlaneSize);
+                const int8_t* data_inp0 = data_inp0_ + subsampleIdx*inpPlaneSize*inpCn;
+                int* data_out0 = data_out0_ + subsampleIdx*outPlaneSize*outCn;
+                int startOutCn = (subsampleIdx % ngroups)*outCn;
+                const int8_t* wptr_orig = wptr_orig_ + wstep*startOutCn;
+                const int* biasptr = biasptr_ + startOutCn;
+                const float* multptr = multiplier + startOutCn;
+
+                for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
+                {
+                    int cn1 = std::min(cn0 + blk_size_cn, inpCn);
+                    int ncn = cn1 - cn0, vsz = karea*ncn;
+                    int vsz_a = (int)alignSize(vsz, valign);
+                    const int8_t* wptr = wptr_orig + cn0*karea;
+
+                    for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
+                    {
+                        int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
+                        int bsz = ofs1 - ofs0;
+
+                        int out_d = ofs0 / (outH * outW);
+                        int out_i = (ofs0 - out_d * outH * outW) / outW;
+                        int out_j = ofs0 % outW;
+
+                        if (depthWiseConvolution)
+                        {
+                            CV_Assert(out_i == 0 && out_j == 0);
+                            int in_d = out_d * stride_d - pad_d;
+                            const int8_t* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
+                            int* outptr_ = data_out0 + ofs0;
+
+                        #if CV_TRY_AVX2
+                            if(useAVX2)
+                                opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
+                            else
+                        #endif
+                            {
+                                const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                                             w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                                             w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+                                int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+                                int bias = biasptr[out_d], biasCopy;
+                                float mult = multptr[out_d];
+
+                                for (int out_i = 0; out_i < outH; out_i++)
+                                {
+                                    int in_i = out_i * stride_h - pad_t, out_j = 0;
+                                    const int8_t* imgptr0 = inptr_ + in_i*width;
+                                    const int8_t* imgptr1 = imgptr0 + dilation_h*width;
+                                    const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
+                                    int8_t w00 = w00_, w01 = w01_, w02 = w02_;
+                                    int8_t w20 = w20_, w21 = w21_, w22 = w22_;
+                                    int out, out1;
+                                    // Bias has a fused offset component. bias = bias_quantized - input_zeropoint*sum_of_weights.
+                                    // In some cases below, certain weights are not used for convolution or set to zero.
+                                    // So we create a copy of bias at the start and remove the weight's components as necessary.
+                                    biasCopy = bias;
+
+                                    if (in_i < 0)
+                                    {
+                                        biasCopy += inpZp * (w00 + w01 + w02);
+                                        w00 = w01 = w02 = 0;
+                                        imgptr0 = imgptr1;
+                                    }
+                                    else if (in_i + dilation_h*(kernel_h-1) >= height)
+                                    {
+                                        biasCopy += inpZp * (w20 + w21 + w22);
+                                        w20 = w21 = w22 = 0;
+                                        imgptr2 = imgptr1;
+                                    }
+                                    int* outptr = outptr_ + out_i*outW;
+                                    if (pad_l > 0)
+                                    {
+                                        out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
+                                              (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
+                                              (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
+                                              biasCopy + inpZp*(w00 + w10 + w20);
+                                        out1 = outZp + (int)std::round(out*mult);
+                                        outptr[0] = std::min(std::max(out1, -128), 127);
+                                        out_j = 1;
+                                    }
+                                #if CV_SIMD
+                                    if( stride_w == 1 )
+                                    {
+                                        const int out_delta = 16;
+                                        v_int8x16 vw00 = v_setall_s8(w00), vw01 = v_setall_s8(w01), vw02 = v_setall_s8(w02),
+                                                  vw10 = v_setall_s8(w10), vw11 = v_setall_s8(w11), vw12 = v_setall_s8(w12),
+                                                  vw20 = v_setall_s8(w20), vw21 = v_setall_s8(w21), vw22 = v_setall_s8(w22);
+                                        v_int32x4 vout0, vout1, vout2, vout3, vbias = v_setall_s32(biasCopy), voutzp = v_setall_s32(outZp),
+                                                  outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
+                                        v_float32x4 vmult = v_setall_f32(mult);
+                                        for( ; out_j < outW1; out_j += out_delta )
+                                        {
+                                            if (out_j + out_delta > outW1)
+                                            {
+                                                if (out_j <= pad_l)
+                                                    break;
+                                                out_j = outW1 - out_delta;
+                                            }
+                                            int in_j = out_j * stride_w - pad_l;
+                                            v_int8x16 v00 = v_load(imgptr0 + in_j),
+                                                      v01 = v_load(imgptr0 + in_j + dilation_w),
+                                                      v02 = v_load(imgptr0 + in_j + dilation_w*2),
+                                                      v10 = v_load(imgptr1 + in_j),
+                                                      v11 = v_load(imgptr1 + in_j + dilation_w),
+                                                      v12 = v_load(imgptr1 + in_j + dilation_w*2),
+                                                      v20 = v_load(imgptr2 + in_j),
+                                                      v21 = v_load(imgptr2 + in_j + dilation_w),
+                                                      v22 = v_load(imgptr2 + in_j + dilation_w*2);
+
+                                            vout0 = vout1 = vout2 = vout3 = vbias;
+                                            v_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
+                                            v_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
+
+                                            vout0 = voutzp + v_round(v_cvt_f32(vout0)*vmult);
+                                            vout1 = voutzp + v_round(v_cvt_f32(vout1)*vmult);
+                                            vout2 = voutzp + v_round(v_cvt_f32(vout2)*vmult);
+                                            vout3 = voutzp + v_round(v_cvt_f32(vout3)*vmult);
+
+                                            vout0 = v_min(v_max(vout0, outmin), outmax);
+                                            vout1 = v_min(v_max(vout1, outmin), outmax);
+                                            vout2 = v_min(v_max(vout2, outmin), outmax);
+                                            vout3 = v_min(v_max(vout3, outmin), outmax);
+
+                                            v_store(outptr + out_j, vout0);
+                                            v_store(outptr + out_j + 4, vout1);
+                                            v_store(outptr + out_j + 8, vout2);
+                                            v_store(outptr + out_j + 12, vout3);
+                                        }
+                                    }
+                                #endif
+                                    for (; out_j < outW1; out_j++)
+                                    {
+                                        int in_j = out_j * stride_w - pad_l;
+                                        out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
+                                              (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
+                                              (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
+                                        out1 = outZp + (int)std::round(out*mult);
+                                        outptr[out_j] = std::min(std::max(out1, -128), 127);
+                                    }
+
+                                    for (; out_j < outW; out_j++ )
+                                    {
+                                        int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+                                        int s0 = 1, s1 = 1, s2 = 1;
+                                        if (in_j0 >= width)
+                                        {
+                                            in_j0 = 0;
+                                            s0 = 0;
+                                            biasCopy += inpZp*(w00 + w10 + w20);
+                                        }
+                                        if (in_j1 >= width)
+                                        {
+                                            in_j1 = 0;
+                                            s1 = 0;
+                                            biasCopy += inpZp*(w01 + w11 + w21);
+                                        }
+                                        if (in_j2 >= width)
+                                        {
+                                            in_j2 = 0;
+                                            s2 = 0;
+                                            biasCopy += inpZp*(w02 + w12 + w22);
+                                        }
+                                        out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
+                                              (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
+                                              (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
+                                        out1 = outZp + (int)std::round(out*mult);
+                                        outptr[out_j] = std::min(std::max(out1, -128), 127);
+                                    }
+                                }
+                            }
+                            continue;
+                        }
+                        // do im2row for a part of input tensor
+                        int8_t* rowbuf = rowbuf0;
+
+                        if (isConv1D)
+                        {
+                            for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
+                            {
+                                int delta = std::min(ofs1 - ofs, outW - out_j);
+                                int out_j1 = out_j + delta;
+
+                                int in_j = out_j * stride_w - pad_l;
+                                const int8_t* imgptr = data_inp0 + cn0*width + in_j;
+                                ofs += delta;
+
+                                // do im2row for a part of input tensor
+                                if( is1x1 )
+                                {
+                                    for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
+                                    {
+                                        for( k = 0; k < vsz; k++ )
+                                            rowbuf[k] = imgptr[k*inpPlaneSize];
+                                    }
+                                }
+                                else
+                                {
+                                    for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
+                                    {
+                                        // this condition should be true for most of the tensor elements, i.e.
+                                        // most of the time the kernel aperture is inside the tensor X-Y plane.
+                                        if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
+                                        {
+                                            for( k = 0; k < vsz; k++ )
+                                            {
+                                                int k1 = ofstab[k];
+                                                int8_t v0 = imgptr[k1];
+                                                int8_t v1 = imgptr[k1 + stride_w];
+                                                rowbuf[k] = v0;
+                                                rowbuf[k+vsz_a] = v1;
+                                            }
+                                            out_j++;
+                                            rowbuf += vsz_a;
+                                            imgptr += stride_w;
+                                            in_j += stride_w;
+                                        }
+                                        else
+                                        {
+                                            int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
+                                            int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
+
+                                            // here some non-continuous sub-row of the row will not be
+                                            // filled from the tensor; we need to make sure that the uncovered
+                                            // elements are explicitly set to 0's. the easiest way is to
+                                            // set all the elements to 0's before the loop.
+                                            memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
+                                            for( k = 0; k < ncn; k++ )
+                                            {
+                                                for( i = i0; i < i1; i++ )
+                                                {
+                                                    int imgofs = k*width + i*dilation_w;
+                                                    rowbuf[k*kernel_w + i] = imgptr[imgofs];
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        else if (isConv2D)
+                        {
+                            if( is1x1 && stride_w == 1 && stride_h == 1 )
+                            {
+                                const int8_t* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
+                                for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
+                                {
+                                    if( j + 4 <= bsz )
+                                    {
+                                        k = 0;
+                                        for( ; k < vsz; k++ )
+                                        {
+                                            const int8_t* inp = imgptr + j + k*inpPlaneSize;
+                                            int8_t v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
+                                            rowbuf[k] = v0;
+                                            rowbuf[k + vsz_a] = v1;
+                                            rowbuf[k + vsz_a*2] = v2;
+                                            rowbuf[k + vsz_a*3] = v3;
+                                        }
+                                        j += 3;
+                                        rowbuf += vsz_a*3;
+                                    }
+                                    else
+                                    {
+                                        for( k = 0; k < vsz; k++ )
+                                        {
+                                            rowbuf[k] = imgptr[j + k*inpPlaneSize];
+                                        }
+                                    }
+                                }
+                            }
+                            else
+                            for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
+                            {
+                                int delta = std::min(ofs1 - ofs, outW - out_j);
+                                int out_j1 = out_j + delta;
+
+                                int in_i = out_i * stride_h - pad_t;
+                                int in_j = out_j * stride_w - pad_l;
+                                const int8_t* imgptr = data_inp0 + (cn0*height + in_i)*width + in_j;
+                                ofs += delta;
+
+                                // do im2row for a part of input tensor
+                                if( is1x1 )
+                                {
+                                    for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
+                                    {
+                                        for( k = 0; k < vsz; k++ )
+                                            rowbuf[k] = imgptr[k*inpPlaneSize];
+                                    }
+                                }
+                                else
+                                {
+                                    bool ok_i = 0 <= in_i && in_i < height - (kernel_h-1)*dilation_h;
+                                    int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
+                                    int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
+
+                                    for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
+                                    {
+                                        // this condition should be true for most of the tensor elements, i.e.
+                                        // most of the time the kernel aperture is inside the tensor X-Y plane.
+                                        if( ok_i && out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
+                                        {
+                                            for( k = 0; k < vsz; k++ )
+                                            {
+                                                int k1 = ofstab[k];
+                                                int8_t v0 = imgptr[k1];
+                                                int8_t v1 = imgptr[k1 + stride_w];
+                                                rowbuf[k] = v0;
+                                                rowbuf[k+vsz_a] = v1;
+                                            }
+                                            out_j++;
+                                            rowbuf += vsz_a;
+                                            imgptr += stride_w;
+                                            in_j += stride_w;
+                                        }
+                                        else
+                                        {
+                                            int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
+                                            int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
+
+                                            // here some non-continuous sub-row of the row will not be
+                                            // filled from the tensor; we need to make sure that the uncovered
+                                            // elements are explicitly set to 0's. the easiest way is to
+                                            // set all the elements to 0's before the loop.
+                                            memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
+                                            for( k = 0; k < ncn; k++ )
+                                            {
+                                                for( i = i0; i < i1; i++ )
+                                                {
+                                                    for( j = j0; j < j1; j++ )
+                                                    {
+                                                        int imgofs = k*(width*height) + i*(dilation_h*width) + j*dilation_w;
+                                                        rowbuf[(k*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        else
+                        {
+                            for( ofs = ofs0; ofs < ofs1; out_d += (out_i + 1) / outH, out_i = (out_i + 1) % outH, out_j = 0 )
+                            {
+                                int delta = std::min(ofs1 - ofs, outW - out_j);
+                                int out_j1 = out_j + delta;
+
+                                int in_d = out_d * stride_d - pad_d;
+                                int in_i = out_i * stride_h - pad_t;
+                                int in_j = out_j * stride_w - pad_l;
+                                const int8_t* imgptr = data_inp0 + (cn0*depth*height + in_d*height + in_i)*width + in_j;
+                                ofs += delta;
+
+                                int d0 = std::max(0, (-in_d + dilation_d - 1) / dilation_d);
+                                int d1 = std::min(kernel_d, (depth - in_d + dilation_d - 1) / dilation_d);
+
+                                int i0 = std::max(0, (-in_i + dilation_h-1)/dilation_h);
+                                int i1 = std::min(kernel_h, (height - in_i + dilation_h-1)/dilation_h);
+
+                                for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
+                                {
+                                    int j0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
+                                    int j1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
+
+                                    // here some non-continuous sub-row of the row will not be
+                                    // filled from the tensor; we need to make sure that the uncovered
+                                    // elements are explicitly set to 0's. the easiest way is to
+                                    // set all the elements to 0's before the loop.
+                                    memset(rowbuf, (int8_t)inpZp, vsz*sizeof(rowbuf[0]));
+                                    for( k = 0; k < ncn; k++ )
+                                    {
+                                        for ( d = d0; d < d1; d++)
+                                        {
+                                            for( i = i0; i < i1; i++ )
+                                            {
+                                                for( j = j0; j < j1; j++ )
+                                                {
+                                                    int imgofs = k*(depth*width*height) + d*dilation_d*width*height + i*(dilation_h*width) + j*dilation_w;
+                                                    rowbuf[(k*kernel_d*kernel_h + d*kernel_h + i)*kernel_w + j] = imgptr[imgofs];
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        // now compute dot product of the weights
+                        // and im2row-transformed part of the tensor
+                    #if CV_TRY_AVX512_SKX
+                        if(useAVX512)
+                            opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                        else
+                    #endif
+                    #if CV_TRY_AVX2
+                        if(useAVX2)
+                            opt_AVX2::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                          outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 == 0, cn1 == inpCn);
+                        else
+                    #endif
+                        for( int i = 0; i < outCn; i += 2 )
+                        {
+                            const int8_t* wptr0 = wptr + i*wstep;
+                            const int8_t* wptr1 = wptr0 + wstep;
+                            int* outptr0 = data_out0 + ofs0 + i*outPlaneSize;
+                            int* outptr1 = outptr0 + outPlaneSize;
+                            int bias0 = biasptr[i], bias1 = biasptr[i+1];
+                            float mult0 = multptr[i], mult1 = multptr[i+1];
+
+                            if( i+1 >= outCn )
+                            {
+                                wptr1 = wptr0;
+                                outptr1 = outptr0;
+                                bias1 = bias0;
+                                mult1 = mult0;
+                            }
+                            int j = 0;
+                        #if CV_SIMD128
+                            v_int32x4 voutzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
+                            v_float32x4 vmult0 = v_setall_f32(mult0), vmult1 = v_setall_f32(mult1);
+                            for( ; j <= bsz - 4; j += 4 )
+                            {
+                                const int8_t* rptr = rowbuf0 + j*vsz_a;
+                                v_int32x4 s0, s1;
+
+                                if( cn0 == 0 )
+                                {
+                                    s0 = v_setall_s32(bias0);
+                                    s1 = v_setall_s32(bias1);
+                                }
+                                else
+                                {
+                                    s0 = v_load(outptr0 + j);
+                                    s1 = v_load(outptr1 + j);
+                                }
+
+                                v_int32x4 vs00 = v_setzero_s32(), vs01 = v_setzero_s32(),
+                                          vs02 = v_setzero_s32(), vs03 = v_setzero_s32(),
+                                          vs10 = v_setzero_s32(), vs11 = v_setzero_s32(),
+                                          vs12 = v_setzero_s32(), vs13 = v_setzero_s32();
+                                for( k = 0; k < vsz; k += 16, rptr += 16 )
+                                {
+                                    v_int8x16 w0 = v_load_aligned(wptr0 + k);
+                                    v_int8x16 w1 = v_load_aligned(wptr1 + k);
+                                    v_int8x16 r0 = v_load_aligned(rptr);
+                                    v_int8x16 r1 = v_load_aligned(rptr + vsz_a);
+                                    v_int8x16 r2 = v_load_aligned(rptr + vsz_a*2);
+                                    v_int8x16 r3 = v_load_aligned(rptr + vsz_a*3);
+
+                                    vs00 = v_dotprod_expand_fast(w0, r0, vs00);
+                                    vs01 = v_dotprod_expand_fast(w0, r1, vs01);
+                                    vs02 = v_dotprod_expand_fast(w0, r2, vs02);
+                                    vs03 = v_dotprod_expand_fast(w0, r3, vs03);
+
+                                    vs10 = v_dotprod_expand_fast(w1, r0, vs10);
+                                    vs11 = v_dotprod_expand_fast(w1, r1, vs11);
+                                    vs12 = v_dotprod_expand_fast(w1, r2, vs12);
+                                    vs13 = v_dotprod_expand_fast(w1, r3, vs13);
+                                }
+                                s0 += v_int32x4(v_reduce_sum(vs00), v_reduce_sum(vs01), v_reduce_sum(vs02), v_reduce_sum(vs03));
+                                s1 += v_int32x4(v_reduce_sum(vs10), v_reduce_sum(vs11), v_reduce_sum(vs12), v_reduce_sum(vs13));
+                                if( cn1 == inpCn )
+                                {
+                                    s0 = voutzp + v_round(v_cvt_f32(s0)*vmult0);
+                                    s1 = voutzp + v_round(v_cvt_f32(s1)*vmult1);
+
+                                    s0 = v_min(v_max(s0, outmin), outmax);
+                                    s1 = v_min(v_max(s1, outmin), outmax);
+                                }
+                                v_store(outptr0 + j, s0);
+                                v_store(outptr1 + j, s1);
+                            }
+                        #endif
+                            for( ; j < bsz; j++ )
+                            {
+                                const int8_t* rptr = rowbuf0 + j*vsz_a;
+                                int s00, s10;
+
+                                if( cn0 == 0 )
+                                {
+                                    s00 = bias0;
+                                    s10 = bias1;
+                                }
+                                else
+                                {
+                                    s00 = outptr0[j];
+                                    s10 = outptr1[j];
+                                }
+
+                                for( k = 0; k < vsz; k++ )
+                                {
+                                    int8_t r0 = rptr[k];
+                                    s00 += (int)wptr0[k] * r0;
+                                    s10 += (int)wptr1[k] * r0;
+                                }
+                                if( cn1 == inpCn )
+                                {
+                                    int out0 = outZp + (int)std::round(s00*mult0);
+                                    int out1 = outZp + (int)std::round(s10*mult1);
+
+                                    s00 = std::min(std::max(out0, -128), 127);
+                                    s10 = std::min(std::max(out1, -128), 127);
+                                }
+
+                                outptr0[j] = s00;
+                                outptr1[j] = s10;
+                            }
+                        }
+                    }
+                }
+                if( activ_ )
+                    activ_->forwardSlice(data_out0 + stripeStart, lutptr_,
+                                         data_out0 + stripeStart, (int)(stripeEnd - stripeStart),
+                                         outPlaneSize, startOutCn, startOutCn + outCn);
+            }
+        }
+    };
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+#if CV_SSE3
+        uint32_t ftzMode = _MM_GET_FLUSH_ZERO_MODE();
+        uint32_t dazMode = _MM_GET_DENORMALS_ZERO_MODE();
+        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        /*if (inputs[0].dims > 3) {
+            printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
+                   name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
+                   kernel.width, kernel.height, pad.width, pad.height,
+                   stride.width, stride.height, dilation.width, dilation.height);
+        }
+        else {
+            printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
+                   name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2],
+                   kernel.width, kernel.height, pad.width, pad.height,
+                   stride.width, stride.height, dilation.width, dilation.height);
+        }*/
+
+        int inpGroupCn = blobs[0].size[1];
+        CV_Assert_N(inputs.size() == (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
+                    outputs.size() == 1, inputs[0].data != outputs[0].data);
+
+        int ngroups = inputs[0].size[1] / inpGroupCn;
+        CV_Assert(outputs[0].size[1] % ngroups == 0);
+
+        int nstripes = std::max(getNumThreads(), 1);
+        Mat outputInt32 = Mat(shape(outputs[0]), CV_32S);
+
+        ParallelConv::run(inputs[0], outputInt32, weightsMat, outputMultiplier, biasvec, activationLUT, kernel_size, strides,
+                          pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes, input_zp, output_zp);
+
+        outputInt32.convertTo(outputs[0], CV_8S);
+
+#if CV_SSE3
+        _MM_SET_FLUSH_ZERO_MODE(ftzMode);
+        _MM_SET_DENORMALS_ZERO_MODE(dazMode);
+#endif
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() == outputs.size());
+
+        int64 flops = 0;
+        int karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
+        for (int i = 0; i < outputs.size(); i++)
+        {
+            flops += total(outputs[i])*(CV_BIG_INT(2)*karea*inputs[i][1] + 1);
+        }
+        return flops;
+    }
+};
+
+Ptr<BaseConvolutionLayer> ConvolutionLayerInt8::create(const LayerParams &params)
+{
+    return Ptr<BaseConvolutionLayer>(new ConvolutionLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/elementwise_layers.cpp b/modules/dnn/src/int8layers/elementwise_layers.cpp
new file mode 100644
index 000000000000..75118b6bc123
--- /dev/null
+++ b/modules/dnn/src/int8layers/elementwise_layers.cpp
@@ -0,0 +1,190 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+#include <iostream>
+
+namespace cv
+{
+namespace dnn
+{
+
+class ActivationLayerInt8Impl CV_FINAL : public ActivationLayerInt8
+{
+public:
+    ActivationLayerInt8Impl(const LayerParams &params)
+    {
+        setParamsFrom(params);
+        activationLUT = !blobs.empty() ? blobs[0] : Mat();
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        return true;
+    }
+
+    class Activation : public cv::ParallelLoopBody
+    {
+    public:
+        const Mat* src;
+        const Mat* lut;
+        Mat* dst;
+        int nstripes;
+
+        Activation() : src(0), lut(0), dst(0), nstripes(0){}
+
+        static void run(const Mat& src, const Mat& lut, Mat& dst, int nstripes)
+        {
+            Activation p;
+
+            p.src = &src;
+            p.lut = &lut;
+            p.dst = &dst;
+            p.nstripes = nstripes;
+
+            parallel_for_(Range(0, nstripes), p, nstripes);
+        }
+
+        void operator()(const Range &r) const CV_OVERRIDE
+        {
+            const int8_t* table = lut->ptr<int8_t>();
+            int nsamples = 1, outCn = 1;
+            size_t planeSize = 1;
+
+            if (src->dims > 1)
+            {
+                nsamples = src->size[0];
+                outCn = src->size[1];
+            }
+            else
+                outCn = src->size[0];
+
+            for (int i = 2; i < src->dims; ++i)
+                planeSize *= src->size[i];
+
+            size_t stripeSize = (planeSize + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = std::min(r.end*stripeSize, planeSize);
+            int len = (int)(stripeEnd - stripeStart);
+
+            for( int i = 0; i < nsamples; i++ )
+            {
+                const int8_t* srcptr = src->ptr<int8_t>(i) + stripeStart;
+                int8_t* dstptr = dst->ptr<int8_t>(i) + stripeStart;
+                for( int cn = 0; cn < outCn; cn++, srcptr += planeSize, dstptr += planeSize )
+                {
+                    int i = 0;
+#if CV_SIMD128
+                    for( ; i <= len - 16; i += 16 )
+                    {
+                        v_int8x16 out(table[srcptr[i] + 128], table[srcptr[i+1] + 128], table[srcptr[i+2] + 128], table[srcptr[i+3] + 128],
+                                      table[srcptr[i+4] + 128], table[srcptr[i+5] + 128], table[srcptr[i+6] + 128], table[srcptr[i+7] + 128],
+                                      table[srcptr[i+8] + 128], table[srcptr[i+9] + 128], table[srcptr[i+10] + 128], table[srcptr[i+11] + 128],
+                                      table[srcptr[i+12] + 128], table[srcptr[i+13] + 128], table[srcptr[i+14] + 128], table[srcptr[i+15] + 128]);
+                        v_store(dstptr + i, out);
+                    }
+#endif
+                    for( ; i < len; i++ )
+                    {
+                        dstptr[i] = table[srcptr[i] + 128];
+                    }
+                }
+            }
+        }
+    };
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            const Mat &src = inputs[i];
+            if (!activationLUT.empty())
+            {
+                const int nstripes = getNumThreads();
+                Mat &dst = outputs[i];
+                CV_Assert(src.size == dst.size && src.type() == dst.type() &&
+                          src.isContinuous() && dst.isContinuous() && src.type() == CV_8S);
+
+                Activation::run(src, activationLUT, dst, nstripes);
+            }
+            else
+            {
+                src.copyTo(outputs[i]);
+            }
+        }
+    }
+
+    void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
+    {
+        for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
+        {
+            int i = 0;
+#if CV_SIMD128
+            for( ; i <= len - 16; i += 16 )
+            {
+                v_int8x16 out(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128],
+                              lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128],
+                              lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128],
+                              lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);
+                v_store(dst + i, out);
+            }
+#endif
+            for( ; i < len; i++ )
+                dst[i] = lut[src[i] + 128];
+        }
+    }
+
+    void forwardSlice(const int* src, const int* lut, int* dst, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
+    {
+        for( int cn = cn0; cn < cn1; cn++, src += planeSize, dst += planeSize )
+        {
+            int i = 0;
+#if CV_SIMD128
+            for( ; i <= len - 16; i += 16 )
+            {
+                v_int32x4 out0(lut[src[i] + 128], lut[src[i+1] + 128], lut[src[i+2] + 128], lut[src[i+3] + 128]);
+                v_int32x4 out1(lut[src[i+4] + 128], lut[src[i+5] + 128], lut[src[i+6] + 128], lut[src[i+7] + 128]);
+                v_int32x4 out2(lut[src[i+8] + 128], lut[src[i+9] + 128], lut[src[i+10] + 128], lut[src[i+11] + 128]);
+                v_int32x4 out3(lut[src[i+12] + 128], lut[src[i+13] + 128], lut[src[i+14] + 128], lut[src[i+15] + 128]);
+
+                v_store(dst + i, out0);
+                v_store(dst + i + 4, out1);
+                v_store(dst + i + 8, out2);
+                v_store(dst + i + 12, out3);
+            }
+#endif
+            for( ; i < len; i++ )
+                dst[i] = lut[src[i] + 128];
+        }
+
+    }
+
+    Mat activationLUT;
+};
+
+Ptr<ActivationLayerInt8> ActivationLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<ActivationLayerInt8>(new ActivationLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/eltwise_layer.cpp b/modules/dnn/src/int8layers/eltwise_layer.cpp
new file mode 100644
index 000000000000..be7a32b1efd9
--- /dev/null
+++ b/modules/dnn/src/int8layers/eltwise_layer.cpp
@@ -0,0 +1,577 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv
+{
+namespace dnn
+{
+
+class EltwiseLayerInt8Impl CV_FINAL : public EltwiseLayerInt8
+{
+public:
+    enum EltwiseOp
+    {
+        PROD = 0,
+        SUM = 1,
+        MAX = 2
+    } op;
+    std::vector<float> coeffs;
+    std::vector<int> zeropoints;
+
+    enum OutputChannelsMode
+    {
+        ELTWISE_CHANNNELS_SAME = 0,              //!< number of channels from inputs must be the same and equal to output's number of channels
+        ELTWISE_CHANNNELS_INPUT_0,               //!< number of channels from inputs may be different,
+                                                 //!< output's number of channels is equal to number of channels of first input
+                                                 //!< number of channels of other inputs should not be greater than number of channels of first input
+        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE,      //!< number of channels from inputs may be different,
+                                                 //!< output's number of channels is equal to number of channels of first input
+                                                 //!< there is restriction on number of channels of other inputs
+                                                 //!< extra channels of other inputs is ignored
+        ELTWISE_CHANNNELS_USE_MAX,               //!< number of channels from inputs may be different,
+                                                 //!< output's number of channels is equal to maximal number of input channels
+                                                 //!< @note supported operation: `SUM`
+    } channelsModeInput;
+
+
+    mutable OutputChannelsMode channelsMode;     //!< "optimized" channels mode (switch to ELTWISE_CHANNNELS_SAME if number of input channels are equal)
+    mutable /*size_t*/int outputChannels;
+
+    EltwiseLayerInt8Impl(const LayerParams& params)
+        : outputChannels(0)
+    {
+        setParamsFrom(params);
+        offset = params.get<float>("offset", 0.f);
+        hasVecInput = false;
+        op = SUM;
+        if (params.has("operation"))
+        {
+            String operation = toLowerCase(params.get<String>("operation"));
+            if (operation == "prod")
+                op = PROD;
+            else if (operation == "sum")
+                op = SUM;
+            else if (operation == "max")
+                op = MAX;
+            else
+                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
+        }
+
+        if (params.has("coeff"))
+        {
+            DictValue paramCoeff = params.get("coeff");
+            int i, n = paramCoeff.size();
+            coeffs.resize(n);
+            for (i = 0; i < n; i++)
+            {
+                coeffs[i] = paramCoeff.get<float>(i);
+            }
+        }
+
+        if (params.has("input_zeropoints"))
+        {
+            DictValue zp = params.get("input_zeropoints");
+            int i, n = zp.size();
+            zeropoints.resize(n);
+            for (i = 0; i < n; i++)
+            {
+                zeropoints[i] = zp.get<int>(i);
+            }
+        }
+
+        channelsModeInput = ELTWISE_CHANNNELS_SAME;
+        if (params.has("output_channels_mode"))
+        {
+            String v = toLowerCase(params.get<String>("output_channels_mode"));
+            if (v == "same")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_SAME;
+            }
+            else if (v == "input_0")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
+            }
+            else if (v == "input_0_truncate")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
+            }
+            else if (v == "max_input_channels")
+            {
+                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
+                if (op != SUM)
+                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
+            }
+            else
+                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
+        }
+        channelsMode = channelsModeInput;
+
+        // TODO Must have checks for other unknown options
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() >= 2);
+        CV_Assert(inputs[0].size() >= 2);
+        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
+        CV_Assert(op == SUM || op == PROD || coeffs.size() == 0);
+
+        int dims = inputs[0].size();
+        // Number of channels in output shape is determined by the first input tensor.
+        bool variableChannels = false;
+        int numChannels = inputs[0][1];
+        for (size_t i = 1; i < inputs.size(); i++)
+        {
+            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal
+
+            int input_channels = inputs[i][1];
+            if (numChannels != input_channels)
+                variableChannels = true;
+
+            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
+            {
+                CV_Assert(numChannels == input_channels);
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
+            {
+                CV_Assert(numChannels >= input_channels);
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
+            {
+                // nothing to check
+            }
+            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
+            {
+                numChannels = std::max(numChannels, input_channels);
+            }
+            else
+            {
+                CV_Assert(0 && "Internal error");
+            }
+        }
+
+        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
+        outputChannels = numChannels;
+
+        outputs.assign(1, inputs[0]);
+        outputs[0][1] = numChannels;
+
+        if (dims > 2)
+        {
+            size_t vecIdx = 0;
+            bool isVecFound = false;
+            for (size_t i = 0; i < inputs.size(); i++)
+            {
+                bool allOnes = isAllOnes(inputs[i], 2, dims);
+                if (!allOnes && !isVecFound)
+                {
+                    vecIdx = i;
+                    isVecFound = true;
+                }
+
+                if (!allOnes && i != vecIdx)
+                {
+                    for (size_t j = 2; j < dims; j++)
+                    {
+                         CV_Assert(inputs[vecIdx][j] == inputs[i][j]);
+                    }
+                }
+            }
+
+            if (channelsModeInput == ELTWISE_CHANNNELS_SAME && isVecFound)
+            {
+                for (size_t j = 2; j < dims; j++)
+                {
+                    outputs[0][j] = inputs[vecIdx][j];
+                }
+            }
+        }
+
+        return false;
+    }
+
+    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            MatShape inpShape = shape(inputs[i].size);
+            if (isAllOnes(inpShape, 2, inputs[i].dims))
+            {
+                hasVecInput = true;
+                return;
+            }
+        }
+    }
+
+    class EltwiseInvoker : public ParallelLoopBody
+    {
+        EltwiseLayerInt8Impl& self;
+        std::vector<const Mat*> srcs;
+        std::vector<int> srcNumChannels;
+        int nsrcs;
+        Mat* dst;
+        Mat* buf;
+        std::vector<float> coeffs;
+        std::vector<int> zeropoints;
+        int nstripes;
+        const Mat* activLUT;
+        const ActivationLayerInt8* activ;
+        int channels;
+        size_t planeSize;
+        float offset;
+
+        EltwiseInvoker(EltwiseLayerInt8Impl& self_)
+            : self(self_)
+            , nsrcs(0), dst(0), buf(0), nstripes(0), activ(0), channels(0)
+            , planeSize(0), offset(0)
+        {}
+
+    public:
+        static void run(EltwiseLayerInt8Impl& self,
+                        const Mat* srcs, int nsrcs, Mat& buf, Mat& dst,
+                        int nstripes, float offset)
+        {
+            const EltwiseOp op = self.op;
+            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_8SC1, ""); CV_Assert(dst.isContinuous());
+            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
+            CV_CheckGE(nsrcs, 2, "");
+
+            CV_Assert(self.outputChannels == dst.size[1]);
+
+            EltwiseInvoker p(self);
+            p.srcs.resize(nsrcs);
+            p.srcNumChannels.resize(nsrcs);
+            p.coeffs = self.coeffs;  // can be sorted
+            p.zeropoints = self.zeropoints;
+
+            bool sortInputs = false;
+            for( int i = 0; i < nsrcs; i++ )
+            {
+                p.srcs[i] = &srcs[i];
+                CV_CheckEQ(srcs[i].dims, dst.dims, "");
+                CV_Assert(srcs[i].isContinuous());
+                CV_Assert(srcs[i].type() == dst.type());
+                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;
+
+                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
+                {
+                    CV_Assert(srcs[i].size == dst.size);
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
+                {
+                    if (i == 0)
+                        CV_Assert(srcs[0].size == dst.size);
+                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
+                    sortInputs = true;
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
+                {
+                    if (i == 0)
+                        CV_Assert(srcs[0].size == dst.size);
+                    sortInputs = true;
+                }
+                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
+                {
+                    CV_Assert(op == SUM);
+                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
+                    sortInputs = true;
+                }
+                else
+                {
+                    CV_Assert(0 && "Internal error");
+                }
+
+                if (sortInputs)
+                {
+                    // Sort srcs and coefficients in the desc order by number of channels
+                    for (int j = i; j >= 1; j--)
+                    {
+                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
+                        {
+                            std::swap(p.srcs[j - 1], p.srcs[j]);
+                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
+                            if (!p.coeffs.empty())
+                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
+                            if (!p.zeropoints.empty())
+                                std::swap(p.zeropoints[j - 1], p.zeropoints[j]);
+                        }
+                        else
+                            break;
+                    }
+                }
+            }
+
+            p.nsrcs = nsrcs;
+            p.dst = &dst;
+            p.buf = &buf;
+            p.nstripes = nstripes;
+            p.offset = offset;
+            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);
+
+            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
+            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");
+            p.activLUT = &self.activationLUT;
+            p.activ = !self.activationLUT.empty() ? self.activ.get() : 0;
+
+            parallel_for_(Range(0, nstripes), p, nstripes);
+        }
+
+        void operator()(const Range& r) const CV_OVERRIDE
+        {
+            const EltwiseOp op = self.op;
+            size_t total = dst->size[0]*planeSize;
+            size_t stripeSize = (total + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = std::min(r.end*stripeSize, total);
+            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
+            const int* zeropointsptr = !zeropoints.empty() ? &zeropoints[0] : 0;
+            const int8_t* lutptr = !activLUT->empty() ? activLUT->ptr<int8_t>() : 0;
+            int8_t* dstptr0 = dst->ptr<int8_t>();
+            float* bufptr0 = buf->ptr<float>();
+            int blockSize0 = 1 << 12;
+
+            for (size_t ofs = stripeStart; ofs < stripeEnd; )
+            {
+                int sampleIdx = (int)(ofs / planeSize);
+                int delta = (int)ofs - sampleIdx * planeSize;
+                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
+                if( blockSize <= 0 )
+                    break;
+                ofs += blockSize;
+
+                for (int c = 0; c < channels; c++)
+                {
+                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
+                    int8_t* dstptr = dstptr0 + dstIdx;
+                    float* bufptr = bufptr0 + dstIdx;
+
+                    // process first two inputs
+                    {
+                        const int8_t* srcptr0 = srcs[0]->ptr<int8_t>() + dstIdx;
+
+                        const int inputIdx = 1;
+                        int src1_channels = srcNumChannels[inputIdx];
+                        if (c >= src1_channels)
+                        {
+                            // no data from second input
+                            if (!coeffsptr)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = srcptr0[j];
+                                }
+                            }
+                            else
+                            {
+                                float c0 = coeffsptr[0];
+                                int z0 = op == PROD ? zeropointsptr[0] : 0;
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    bufptr[j] = c0 * (srcptr0[j] - z0);
+                                }
+                            }
+                        }
+                        else
+                        {
+                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
+                            const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;
+
+                            if (op == PROD)
+                            {
+                                float c0 = coeffsptr[0];
+                                float c1 = coeffsptr[1];
+                                int z0 = zeropointsptr[0];
+                                int z1 = zeropointsptr[1];
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    bufptr[j] = (c0*(srcptr0[j] - z0)) * (c1*(srcptrI[j] - z1));
+                                }
+                            }
+                            else if (op == MAX)
+                            {
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
+                                }
+                            }
+                            else if (op == SUM)
+                            {
+                                float c0 = coeffsptr[0];
+                                float c1 = coeffsptr[1];
+                                for (int j = 0; j < blockSize; j++)
+                                {
+                                    bufptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
+                                }
+                            }
+                            else
+                                CV_Error(Error::StsInternal, "");
+                        }
+                    }
+
+                    // aggregate other inputs (3+)
+                    for (size_t inputIdx = 2; inputIdx < nsrcs; inputIdx++)
+                    {
+                        int srcI_channels = srcNumChannels[inputIdx];
+                        if (c >= srcI_channels)
+                            continue;  // no data from second input
+                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
+                        const int8_t* srcptrI = srcs[inputIdx]->ptr<int8_t>() + srcIdx;
+
+                        if (op == PROD)
+                        {
+                            float cI = coeffsptr[inputIdx];
+                            int zI = zeropointsptr[inputIdx];
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                bufptr[j] *= cI*(srcptrI[j] - zI);
+                            }
+                        }
+                        else if (op == MAX)
+                        {
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
+                            }
+                        }
+                        else if (op == SUM)
+                        {
+                            float cI = coeffsptr[inputIdx];
+                            for (int j = 0; j < blockSize; j++)
+                            {
+                                bufptr[j] += cI * srcptrI[j];
+                            }
+                        }
+                        else
+                            CV_Error(Error::StsInternal, "");
+                    }
+
+                    // add offset and saturate cast to int8
+                    if (op == SUM || op == PROD)
+                    {
+                        for (int j = 0; j < blockSize; j++)
+                        {
+                            dstptr[j] = saturate_cast<int8_t>(std::round(bufptr[j] + offset));
+                        }
+                    }
+                }
+                if( activ )
+                {
+                    int8_t* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
+                    activ->forwardSlice(ptr, lutptr, ptr, blockSize, planeSize, 0, channels);
+                }
+            }
+        }
+    };
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        CV_Assert(outputs.size() == 1);
+        const int nstripes = getNumThreads();
+
+        if (channelsModeInput == ELTWISE_CHANNNELS_SAME && inputs[0].dims > 2)
+        {
+            for (size_t i = 0; i < inputs.size(); i++)
+            {
+                MatShape inpShape = shape(inputs[i].size);
+                bool allOnes = isAllOnes(inpShape, 2, inputs[i].dims);
+
+                if (allOnes)
+                {
+                    Mat tmpInput = inputs[i];
+                    MatShape outShape = shape(outputs[0].size);
+                    size_t xSize = outShape[2];
+                    for (size_t j = 3; j < outShape.size(); j++)
+                        xSize *= outShape[j];
+
+                    int dimVec[3] = {outShape[0], outShape[1], (int) xSize};
+                    std::vector<int> matSizesVec(&dimVec[0], &dimVec[0] + 3);
+                    inputs[i] = Mat(matSizesVec, tmpInput.type());
+
+                    std::vector<int> idx(outShape.size(), 0);
+                    std::vector<int> outIdx(inpShape.size(), 0);
+
+                    for (size_t j = 0; j < outShape[0]; j++)
+                    {
+                        outIdx[0] = idx[0] = j;
+                        for(size_t k = 0; k < outShape[1]; k++)
+                        {
+                            outIdx[1] = idx[1] = k;
+                            for (size_t x = 0; x < xSize; x++)
+                            {
+                                outIdx[2] = x;
+                                inputs[i].at<int8_t>(outIdx.data()) = tmpInput.at<int8_t>(idx.data());
+                            }
+                        }
+                    }
+                    inputs[i] = inputs[i].reshape(0, outShape);
+                }
+            }
+        }
+
+        Mat buf = Mat(shape(outputs[0]), CV_32F); // to store intermediate results
+        EltwiseInvoker::run(*this, &inputs[0], (int)inputs.size(), buf, outputs[0], nstripes, offset);
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(outputs); // suppress unused variable warning
+        CV_Assert(inputs.size());
+
+        // FIXIT: handle inputs with different number of channels
+        long flops = inputs.size() * total(inputs[0]);
+
+        return flops;
+    }
+
+    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            activ = activ_int8;
+            if (!activ_int8->blobs.empty())
+                activationLUT = activ_int8->blobs[0];
+            return true;
+        }
+        return false;
+    }
+
+    Mat activationLUT;
+    Ptr<ActivationLayerInt8> activ;
+
+private:
+    bool hasVecInput;
+    float offset;
+};
+
+Ptr<EltwiseLayerInt8> EltwiseLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<EltwiseLayerInt8>(new EltwiseLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/fully_connected_layer.cpp b/modules/dnn/src/int8layers/fully_connected_layer.cpp
new file mode 100644
index 000000000000..83da677a47f6
--- /dev/null
+++ b/modules/dnn/src/int8layers/fully_connected_layer.cpp
@@ -0,0 +1,266 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv
+{
+namespace dnn
+{
+
+class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
+{
+public:
+    enum { VEC_ALIGN = 32 };
+    FullyConnectedLayerInt8Impl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        output_zp = params.get<int>("zeropoints");
+        axis = params.get<int>("axis", 1);
+        if (blobs.size() == 3)
+        {
+            // blobs[0] - Weights
+            // blobs[1] - Bias fused with offset
+            // blobs[2] - Multipliers for output stage
+            int numOutput = params.get<int>("num_output");
+            int innerSize = (int)blobs[0].total() / numOutput;
+
+            CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
+            CV_Assert((size_t)numOutput == blobs[1].total());
+
+            weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
+            int vecsize = weightsMat.cols;
+            if (vecsize % VEC_ALIGN != 0)
+            {
+                int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
+                Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
+                Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
+                wpadding.setTo(Scalar::all(0));
+                weightsMat = weightsBuf.colRange(0, vecsize);
+                blobs[0].copyTo(weightsMat);
+            }
+            biasMat = blobs[1] = blobs[1].reshape(1, 1);
+            outputMultiplier = blobs[2];
+        }
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &) const CV_OVERRIDE
+    {
+        int numOutput, cAxis;
+        CV_CheckEQ(inputs.size(), (size_t)1, "");
+        CV_CheckEQ(blobs[0].dims, 2, "");
+        numOutput = blobs[0].size[0];
+        CV_Assert((size_t)numOutput == blobs[1].total());
+        cAxis = normalize_axis(axis, inputs[0]);
+
+        MatShape outShape(cAxis + 1);
+        for (int i = 0; i < cAxis; ++i)
+            outShape[i] = inputs[0][i];
+        outShape.back() = numOutput;
+
+        outputs.resize(1, outShape);
+        return false;
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            activ = activ_int8;
+            if (!activ_int8->blobs.empty())
+                activ_int8->blobs[0].convertTo(activationLUT, CV_32S);
+            return true;
+        }
+        return false;
+    }
+
+    class FullyConnected : public ParallelLoopBody
+    {
+    public:
+        FullyConnected() : srcMat(0), weights(0), biasMat(0), outputMultiplier(0), activationLUT(0), activ(0),
+                           dstMat(0), nstripes(0), outZp(0), useAVX2(false), useAVX512(false) {}
+
+        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat, const Mat& outputMultiplier,
+                        const Mat& activationLUT, Mat& dstMat, const ActivationLayerInt8* activ, int nstripes, int outZp)
+        {
+            CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
+                       dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
+                       srcMat.type() == weights.type() && srcMat.type() == CV_8S &&
+                       dstMat.type() == CV_32S && biasMat.type() == CV_32S &&
+                       biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols );
+
+            FullyConnected p;
+
+            p.srcMat = &srcMat;
+            p.weights = &weights;
+            p.biasMat = &biasMat;
+            p.outputMultiplier = &outputMultiplier;
+            p.activationLUT = &activationLUT;
+            p.dstMat = &dstMat;
+            p.nstripes = nstripes;
+            p.outZp = outZp;
+            p.activ = !activationLUT.empty() ? activ : 0;
+            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
+            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
+
+            parallel_for_(Range(0, nstripes), p, nstripes);
+        }
+
+        void operator()(const Range& r) const CV_OVERRIDE
+        {
+            int valign = FullyConnectedLayerInt8Impl::VEC_ALIGN;
+            int nsamples = srcMat->rows;
+            int nw0 = weights->rows;
+            int k, vecsize = srcMat->cols;
+            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
+            size_t total = (size_t)nsamples*nw0;
+            size_t stripeSize = (total + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
+            size_t wstep = weights->step1();
+            AutoBuffer<int8_t> srcbuf(vecsize_aligned + valign);
+            int8_t* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(int8_t)));
+            const int* lutptr = !activationLUT->empty() ? activationLUT->ptr<int>() : 0;
+
+            for( k = vecsize; k < vecsize_aligned; k++ )
+                sptr[k] = 0;
+
+            for( size_t ofs = stripeStart; ofs < stripeEnd; )
+            {
+                int sampleIdx = (int)(ofs / nw0);
+                int delta = (int)(ofs - (size_t)sampleIdx*nw0);
+                const int8_t* sptr_ = srcMat->ptr<int8_t>(sampleIdx);
+                const int8_t* wptr = weights->ptr<int8_t>(delta);
+                int* dptr = dstMat->ptr<int>(sampleIdx) + delta;
+                const int* biasptr = biasMat->ptr<int>() + delta;
+                const float* multptr = outputMultiplier->ptr<float>() + delta;
+                int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));
+
+                memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));
+            #if CV_TRY_AVX512_SKX
+                if( useAVX512 )
+                    opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
+            #if CV_TRY_AVX2
+                if( useAVX2 )
+                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
+                else
+            #endif
+                {
+                    int i = 0;
+            #if CV_SIMD
+                    for( ; i  <= nw - 4; i += 4, wptr += 4*wstep )
+                    {
+                        v_int32x4 vs0 = v_setzero_s32(), vs1 = v_setzero_s32(),
+                                  vs2 = v_setzero_s32(), vs3 = v_setzero_s32();
+                        v_int32x4 outzp = v_setall_s32(outZp), outmin = v_setall_s32(-128), outmax = v_setall_s32(127);
+                        v_int32x4 s = v_load(biasptr + i);
+                        v_float32x4 mult = v_load(multptr + i);
+
+                        for( k = 0; k < vecsize; k += 16 )
+                        {
+                            v_int8x16 v = v_load_aligned(sptr + k);
+                            vs0 = v_dotprod_expand_fast(v, v_load_aligned(wptr + k), vs0);
+                            vs1 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep + k), vs1);
+                            vs2 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*2 + k), vs2);
+                            vs3 = v_dotprod_expand_fast(v, v_load_aligned(wptr + wstep*3 + k), vs3);
+                        }
+
+                        s += v_int32x4(v_reduce_sum(vs0), v_reduce_sum(vs1), v_reduce_sum(vs2), v_reduce_sum(vs3));
+                        v_int32x4 out = outzp + v_round(v_cvt_f32(s)*mult);
+                        v_store(dptr + i, v_min(v_max(out, outmin), outmax));
+                    }
+            #endif
+
+                    for( ; i < nw; i++, wptr += wstep )
+                    {
+                        int s0 = biasptr[i];
+                        float mult0 = multptr[i];
+
+                        for( k = 0; k < vecsize; k++ )
+                        {
+                            int8_t v = sptr[k];
+                            s0 += (int)v*wptr[k];
+                        }
+                        int out0 = outZp + (int)std::round(s0*mult0);
+                        dptr[i] = std::min(std::max(out0, -128), 127);
+                    }
+                }
+
+                if(activ)
+                    activ->forwardSlice(dptr, lutptr, dptr, 1, 1, delta, delta + nw);
+
+                ofs += nw;
+            }
+        }
+
+        const Mat *srcMat, *weights, *biasMat, *outputMultiplier, *activationLUT;
+        const ActivationLayerInt8* activ;
+        Mat* dstMat;
+        int nstripes, outZp;
+        bool useAVX2;
+        bool useAVX512;
+    };
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> input, output;
+        inputs_arr.getMatVector(input);
+        outputs_arr.getMatVector(output);
+
+        int axisCan = normalize_axis(axis, input[0].dims);
+        int outerSize = input[0].total(0, axisCan);
+        Mat srcMat = input[0].reshape(1, outerSize);
+
+        Mat dstMat = output[0].reshape(1, outerSize);
+        Mat dstMatInt32= Mat(shape(dstMat), CV_32S);
+
+        const int nstripes = getNumThreads();
+        FullyConnected::run(srcMat, weightsMat, biasMat, outputMultiplier, activationLUT, dstMatInt32, activ.get(), nstripes, output_zp);
+        dstMatInt32.convertTo(dstMat, CV_8S);
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(inputs); // suppress unused variable warning
+        long flops = 0;
+
+        int innerSize = blobs[0].size[1];
+        for(int i = 0; i < outputs.size(); i++)
+        {
+            flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
+        }
+
+        return flops;
+
+    }
+
+    Mat weightsMat, biasMat, outputMultiplier, activationLUT;
+    Ptr<ActivationLayerInt8> activ;
+};
+
+Ptr<InnerProductLayerInt8> InnerProductLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<InnerProductLayerInt8>(new FullyConnectedLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/layers_common.hpp b/modules/dnn/src/int8layers/layers_common.hpp
new file mode 100644
index 000000000000..cb185a9edaa4
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.hpp
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#define __OPENCV_DNN_LAYERS_LAYERS_COMMON_HPP__
+#include <opencv2/dnn.hpp>
+#include <opencv2/dnn/shape_utils.hpp>
+
+#define CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+// dispatched AVX/AVX2 optimizations
+#include "./layers_common.simd.hpp"
+#include "int8layers/layers_common.simd_declarations.hpp"
+#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+#ifdef HAVE_OPENCL
+#include "../ocl4dnn/include/ocl4dnn.hpp"
+#endif
+
+namespace cv
+{
+namespace dnn
+{
+void getConvolutionKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<size_t>& pads_begin,
+                                std::vector<size_t>& pads_end, std::vector<size_t>& strides, std::vector<size_t>& dilations,
+                                cv::String &padMode, std::vector<size_t>& adjust_pads);
+
+void getPoolingKernelParams(const LayerParams &params, std::vector<size_t>& kernel, std::vector<bool>& globalPooling,
+                            std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end, std::vector<size_t>& strides, cv::String &padMode);
+
+void getConvPoolOutParams(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& stride, const String &padMode,
+                          const std::vector<size_t>& dilation, std::vector<int>& out);
+
+ void getConvPoolPaddings(const std::vector<int>& inp, const std::vector<size_t>& kernel,
+                          const std::vector<size_t>& strides, const String &padMode,
+                          std::vector<size_t>& pads_begin, std::vector<size_t>& pads_end);
+}
+}
+
+#endif
diff --git a/modules/dnn/src/int8layers/layers_common.simd.hpp b/modules/dnn/src/int8layers/layers_common.simd.hpp
new file mode 100644
index 000000000000..bf6149e5c958
--- /dev/null
+++ b/modules/dnn/src/int8layers/layers_common.simd.hpp
@@ -0,0 +1,637 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "opencv2/core/hal/intrin.hpp"
+
+namespace cv {
+namespace dnn {
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+void fastConv( const int8_t* weights, size_t wstep, const int* bias,
+               const int8_t* rowbuf, int* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned, int outZp,
+               const float* multiplier, bool initOutput, bool finalOutput );
+void fastDepthwiseConv( const int8_t* wptr,
+                        int kernel_h, int kernel_w,
+                        int stride_h, int stride_w,
+                        int dilation_h, int dilation_w,
+                        int pad_t, int pad_l,
+                        const int* biasptr, const float* multptr,
+                        const int8_t* inptr_,
+                        int height, int width,
+                        int* outptr_,
+                        int out_d, int outH, int outW,
+                        int inpZp, int outZp );
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp );
+
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_AVX2
+#define OPENCV_FMADD_EPI8(_Tpvec, func) \
+    inline _Tpvec _##func##_fmaddepi8_epi32(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+    { \
+        _Tpvec even_a = _##func##_srai_epi16(_##func##_bslli_epi128(a, 1), 8); \
+        _Tpvec odd_a  = _##func##_srai_epi16(a, 8);                            \
+                                                                               \
+        _Tpvec even_b = _##func##_srai_epi16(_##func##_bslli_epi128(b, 1), 8); \
+        _Tpvec odd_b  = _##func##_srai_epi16(b, 8);                            \
+                                                                               \
+        _Tpvec prod0  = _##func##_madd_epi16(even_a, even_b);                  \
+        _Tpvec prod1  = _##func##_madd_epi16(odd_a, odd_b);                    \
+        return _##func##_add_epi32(_##func##_add_epi32(prod0, prod1), c);      \
+    }
+OPENCV_FMADD_EPI8(__m256i, mm256)
+//OPENCV_FMADD_EPI8(__m512i, mm512)
+
+enum { FASCONV_BASE_VECSZ = 4 };
+
+void fastConv( const int8_t* weights, size_t wstep, const int* bias,
+               const int8_t* rowbuf, int* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned, int outZp,
+               const float* multiplier, bool initOutput, bool finalOutput )
+{
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2]*outShape[3];
+    int CV_DECL_ALIGNED(16) maskbuf[FASCONV_BASE_VECSZ] = {0};
+    int rsz = blockSize % FASCONV_BASE_VECSZ;
+    for( int i = 0; i < rsz; i++ )
+        maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
+    __m128 mask = _mm_loadu_ps((const float*)maskbuf);
+
+    // now compute dot product of the weights
+    // and im2row-transformed part of the tensor
+    for( int i = 0; i < outCn; i += 3 )
+    {
+        const int8_t* wptr0 = weights + i*wstep;
+        const int8_t* wptr1 = wptr0 + wstep;
+        const int8_t* wptr2 = wptr1 + wstep;
+        int* outptr0 = output + i*outPlaneSize;
+        int* outptr1 = outptr0 + outPlaneSize;
+        int* outptr2 = outptr1 + outPlaneSize;
+        int bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
+        float mult0 = multiplier[i], mult1 = multiplier[i+1], mult2 = multiplier[i+2];
+
+        if( i+2 >= outCn )
+        {
+            wptr2 = wptr1;
+            outptr2 = outptr1;
+            bias2 = bias1;
+            mult2 = mult1;
+
+            if( i+1 >= outCn )
+            {
+                wptr2 = wptr1 = wptr0;
+                outptr2 = outptr1 = outptr0;
+                bias2 = bias1 = bias0;
+                mult2 = mult1 = mult0;
+            }
+        }
+        int j = 0;
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
+        {
+            bool tail = false;
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                if (j == 0)
+                    break;
+                j = blockSize - FASCONV_BASE_VECSZ;
+                tail = true;
+            }
+            int k = 0;
+            const int8_t* rptr = rowbuf + j*vecsize_aligned;
+
+            __m256i vs00 = _mm256_setzero_si256(), vs01 = _mm256_setzero_si256(),
+                    vs02 = _mm256_setzero_si256(), vs03 = _mm256_setzero_si256(),
+                    vs10 = _mm256_setzero_si256(), vs11 = _mm256_setzero_si256(),
+                    vs12 = _mm256_setzero_si256(), vs13 = _mm256_setzero_si256(),
+                    vs20 = _mm256_setzero_si256(), vs21 = _mm256_setzero_si256(),
+                    vs22 = _mm256_setzero_si256(), vs23 = _mm256_setzero_si256();
+
+            /* TODO : Fix AVX-512 path. Segmentation fault in Conv2D Tests.
+#if CV_AVX512_SKX // AVX512VL is necessary to avoid register spilling
+            if (vecsize >= 64)
+            {
+                __m512i vs00_5 = _mm512_setzero_si512(), vs01_5 = _mm512_setzero_si512(),
+                        vs02_5 = _mm512_setzero_si512(), vs03_5 = _mm512_setzero_si512(),
+                        vs10_5 = _mm512_setzero_si512(), vs11_5 = _mm512_setzero_si512(),
+                        vs12_5 = _mm512_setzero_si512(), vs13_5 = _mm512_setzero_si512(),
+                        vs20_5 = _mm512_setzero_si512(), vs21_5 = _mm512_setzero_si512(),
+                        vs22_5 = _mm512_setzero_si512(), vs23_5 = _mm512_setzero_si512();
+
+                for (; k <= vecsize - 64; k += 64, rptr += 64)
+                {
+                    __m512i w0 = _mm512_load_si512(wptr0 + k);
+                    __m512i w1 = _mm512_load_si512(wptr1 + k);
+                    __m512i w2 = _mm512_load_si512(wptr2 + k);
+                    __m512i r0 = _mm512_load_si512(rptr);
+
+                    vs00_5 = _mm512_fmaddepi8_epi32(w0, r0, vs00_5);
+                    vs10_5 = _mm512_fmaddepi8_epi32(w1, r0, vs10_5);
+                    vs20_5 = _mm512_fmaddepi8_epi32(w2, r0, vs20_5);
+
+                    r0 = _mm512_load_si512(rptr + vecsize_aligned);
+                    vs01_5 = _mm512_fmaddepi8_epi32(w0, r0, vs01_5);
+                    vs11_5 = _mm512_fmaddepi8_epi32(w1, r0, vs11_5);
+                    vs21_5 = _mm512_fmaddepi8_epi32(w2, r0, vs21_5);
+
+                    r0 = _mm512_load_si512(rptr + vecsize_aligned*2);
+                    vs02_5 = _mm512_fmaddepi8_epi32(w0, r0, vs02_5);
+                    vs12_5 = _mm512_fmaddepi8_epi32(w1, r0, vs12_5);
+                    vs22_5 = _mm512_fmaddepi8_epi32(w2, r0, vs22_5);
+
+                    r0 = _mm512_load_si512(rptr + vecsize_aligned*3);
+                    vs03_5 = _mm512_fmaddepi8_epi32(w0, r0, vs03_5);
+                    vs13_5 = _mm512_fmaddepi8_epi32(w1, r0, vs13_5);
+                    vs23_5 = _mm512_fmaddepi8_epi32(w2, r0, vs23_5);
+                }
+
+                // now fold the 512 bit accumulator vectors into 256 bit vectors so that the AVX2 code can finish
+                // the tail of the vector
+
+                vs00 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs00_5, 0), _mm512_extracti32x8_epi32(vs00_5, 1));
+                vs10 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs10_5, 0), _mm512_extracti32x8_epi32(vs10_5, 1));
+                vs20 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs20_5, 0), _mm512_extracti32x8_epi32(vs20_5, 1));
+
+                vs01 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs01_5, 0), _mm512_extracti32x8_epi32(vs01_5, 1));
+                vs11 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs11_5, 0), _mm512_extracti32x8_epi32(vs11_5, 1));
+                vs21 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs21_5, 0), _mm512_extracti32x8_epi32(vs21_5, 1));
+
+                vs02 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs02_5, 0), _mm512_extracti32x8_epi32(vs02_5, 1));
+                vs12 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs12_5, 0), _mm512_extracti32x8_epi32(vs12_5, 1));
+                vs22 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs22_5, 0), _mm512_extracti32x8_epi32(vs22_5, 1));
+
+                vs03 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs03_5, 0), _mm512_extracti32x8_epi32(vs03_5, 1));
+                vs13 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs13_5, 0), _mm512_extracti32x8_epi32(vs13_5, 1));
+                vs23 = _mm256_add_epi32( _mm512_extracti32x8_epi32(vs23_5, 0), _mm512_extracti32x8_epi32(vs23_5, 1));
+            }
+#endif
+            */
+            for (; k < vecsize; k += 32, rptr += 32 )
+            {
+                __m256i w0 = _mm256_load_si256((const __m256i*)(wptr0 + k));
+                __m256i w1 = _mm256_load_si256((const __m256i*)(wptr1 + k));
+                __m256i w2 = _mm256_load_si256((const __m256i*)(wptr2 + k));
+                __m256i r0 = _mm256_load_si256((const __m256i*)rptr);
+
+                vs00 = _mm256_fmaddepi8_epi32(w0, r0, vs00);
+                vs10 = _mm256_fmaddepi8_epi32(w1, r0, vs10);
+                vs20 = _mm256_fmaddepi8_epi32(w2, r0, vs20);
+
+                r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned));
+                vs01 = _mm256_fmaddepi8_epi32(w0, r0, vs01);
+                vs11 = _mm256_fmaddepi8_epi32(w1, r0, vs11);
+                vs21 = _mm256_fmaddepi8_epi32(w2, r0, vs21);
+
+                r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*2));
+                vs02 = _mm256_fmaddepi8_epi32(w0, r0, vs02);
+                vs12 = _mm256_fmaddepi8_epi32(w1, r0, vs12);
+                vs22 = _mm256_fmaddepi8_epi32(w2, r0, vs22);
+
+                r0 = _mm256_load_si256((const __m256i*)(rptr + vecsize_aligned*3));
+                vs03 = _mm256_fmaddepi8_epi32(w0, r0, vs03);
+                vs13 = _mm256_fmaddepi8_epi32(w1, r0, vs13);
+                vs23 = _mm256_fmaddepi8_epi32(w2, r0, vs23);
+            }
+
+            __m256i t0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs00, vs01), _mm256_hadd_epi32(vs02, vs03));
+            __m256i t1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs10, vs11), _mm256_hadd_epi32(vs12, vs13));
+            __m256i t2 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs20, vs21), _mm256_hadd_epi32(vs22, vs23));
+
+            t0 = _mm256_add_epi32(t0, _mm256_permute2x128_si256(t0, t0, 1));
+            t1 = _mm256_add_epi32(t1, _mm256_permute2x128_si256(t1, t1, 1));
+            t2 = _mm256_add_epi32(t2, _mm256_permute2x128_si256(t2, t2, 1));
+
+            __m128i s0, s1, s2;
+
+            if( initOutput )
+            {
+                s0 = _mm_set1_epi32(bias0);
+                s1 = _mm_set1_epi32(bias1);
+                s2 = _mm_set1_epi32(bias2);
+            }
+            else
+            {
+                s0 = _mm_loadu_si128((__m128i*)(outptr0 + j));
+                s1 = _mm_loadu_si128((__m128i*)(outptr1 + j));
+                s2 = _mm_loadu_si128((__m128i*)(outptr2 + j));
+            }
+
+            s0 = _mm_add_epi32(s0, _mm256_castsi256_si128(t0));
+            s1 = _mm_add_epi32(s1, _mm256_castsi256_si128(t1));
+            s2 = _mm_add_epi32(s2, _mm256_castsi256_si128(t2));
+
+            if( finalOutput )
+            {
+                __m128i voutzp = _mm_set1_epi32(outZp);
+                __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127);
+                s0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s0), _mm_set1_ps(mult0))));
+                s1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s1), _mm_set1_ps(mult1))));
+                s2 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(s2), _mm_set1_ps(mult2))));
+
+                s0 = _mm_min_epi32(_mm_max_epi32(s0, outmin), outmax);
+                s1 = _mm_min_epi32(_mm_max_epi32(s1, outmin), outmax);
+                s2 = _mm_min_epi32(_mm_max_epi32(s2, outmin), outmax);
+            }
+            if( tail )
+            {
+                s0 =  _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr0 + j),  _mm_castsi128_ps(s0), mask));
+                s1 =  _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr1 + j),  _mm_castsi128_ps(s1), mask));
+                s2 =  _mm_castps_si128(_mm_blendv_ps(_mm_loadu_ps((const float*)outptr2 + j),  _mm_castsi128_ps(s2), mask));
+            }
+            _mm_storeu_si128((__m128i*)(outptr0 + j), s0);
+            _mm_storeu_si128((__m128i*)(outptr1 + j), s1);
+            _mm_storeu_si128((__m128i*)(outptr2 + j), s2);
+        }
+
+        for( ; j <= blockSize - 2; j += 2 )
+        {
+            const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
+            const int8_t* rptr1 = rowbuf + (j+1)*vecsize_aligned;
+            int s00, s01, s10, s11, s20, s21;
+
+            if( initOutput )
+            {
+                s00 = s01 = bias0;
+                s10 = s11 = bias1;
+                s20 = s21 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j]; s01 = outptr0[j+1];
+                s10 = outptr1[j]; s11 = outptr1[j+1];
+                s20 = outptr2[j]; s21 = outptr2[j+1];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                int8_t r = rptr0[k];
+                s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
+                r = rptr1[k];
+                s01 += (int)w0*r; s11 += (int)w1*r; s21 += (int)w2*r;
+            }
+
+            if( finalOutput )
+            {
+                s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
+                s01 = std::min(std::max(outZp + (int)std::round(s01*mult0), -128), 127);
+                s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
+                s11 = std::min(std::max(outZp + (int)std::round(s11*mult1), -128), 127);
+                s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
+                s21 = std::min(std::max(outZp + (int)std::round(s21*mult2), -128), 127);
+            }
+            outptr0[j] = s00;
+            outptr0[j+1] = s01;
+            outptr1[j] = s10;
+            outptr1[j+1] = s11;
+            outptr2[j] = s20;
+            outptr2[j+1] = s21;
+        }
+
+        for( ; j < blockSize; j++ )
+        {
+            const int8_t* rptr0 = rowbuf + j*vecsize_aligned;
+            int s00, s10, s20;
+
+            if( initOutput )
+            {
+                s00 = bias0;
+                s10 = bias1;
+                s20 = bias2;
+            }
+            else
+            {
+                s00 = outptr0[j];
+                s10 = outptr1[j];
+                s20 = outptr2[j];
+            }
+
+            for( int k = 0; k < vecsize; k++ )
+            {
+                int8_t w0 = wptr0[k], w1 = wptr1[k], w2 = wptr2[k];
+                int8_t r = rptr0[k];
+                s00 += (int)w0*r; s10 += (int)w1*r; s20 += (int)w2*r;
+            }
+
+            if( finalOutput )
+            {
+                s00 = std::min(std::max(outZp + (int)std::round(s00*mult0), -128), 127);
+                s10 = std::min(std::max(outZp + (int)std::round(s10*mult1), -128), 127);
+                s20 = std::min(std::max(outZp + (int)std::round(s20*mult2), -128), 127);
+            }
+            outptr0[j] = s00;
+            outptr1[j] = s10;
+            outptr2[j] = s20;
+        }
+    }
+    _mm256_zeroupper();
+}
+
+static inline void _mm256_expand_mul_add(const __m256i& a, const __m256i& b,
+                                         __m256i& out0, __m256i& out1, __m256i& out2, __m256i& out3)
+{
+    __m256i a0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a));
+    __m256i a1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(a, 1));
+
+    __m256i b0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(b));
+    __m256i b1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(b, 1));
+
+    __m256i a0b0 = _mm256_mullo_epi16(a0, b0);
+    __m256i a1b1 = _mm256_mullo_epi16(a1, b1);
+
+    out0 = _mm256_add_epi32(out0, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a0b0)));
+    out1 = _mm256_add_epi32(out1, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a0b0, 1)));
+    out2 = _mm256_add_epi32(out2, _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a1b1)));
+    out3 = _mm256_add_epi32(out3, _mm256_cvtepi16_epi32(_mm256_extracti128_si256(a1b1, 1)));
+}
+
+static inline void _mm256_load_deinterleave(const int8_t* ptr, __m256i& a, __m256i& b)
+{
+    __m256i t0 = _mm256_loadu_si256((const __m256i*)ptr);
+    __m256i t1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
+
+    const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+                                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+    __m256i p0 = _mm256_shuffle_epi8(t0, sh);
+    __m256i p1 = _mm256_shuffle_epi8(t1, sh);
+    __m256i lo = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
+    __m256i hi = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
+    a = _mm256_unpacklo_epi64(lo, hi);
+    b = _mm256_unpackhi_epi64(lo, hi);
+}
+
+void fastDepthwiseConv( const int8_t* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const int* biasptr, const float* multptr,
+                     const int8_t* inptr_,
+                     int height, int width,
+                     int* outptr_,
+                     int out_d, int outH, int outW,
+                     int inpZp, int outZp)
+{
+    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float mult = multptr[out_d];
+    int bias = biasptr[out_d];
+    int biasCopy;
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const int8_t* imgptr0 = inptr_ + in_i*width;
+        const int8_t* imgptr1 = imgptr0 + dilation_h*width;
+        const int8_t* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
+        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
+        int out;
+        biasCopy = bias;
+        if (in_i < 0)
+        {
+            biasCopy += inpZp * (w00 + w01 + w02);
+            w00 = w01 = w02 = 0;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            biasCopy += inpZp * (w20 + w21 + w22);
+            w20 = w21 = w22 = 0;
+            imgptr2 = imgptr1;
+        }
+        int* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = (int)imgptr0[0]*w01 + (int)imgptr0[dilation_w]*w02 +
+                  (int)imgptr1[0]*w11 + (int)imgptr1[dilation_w]*w12 +
+                  (int)imgptr2[0]*w21 + (int)imgptr2[dilation_w]*w22 +
+                  biasCopy + inpZp*(w00 + w10 + w20);
+            outptr[0] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+            out_j = 1;
+        }
+
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            const int VECSZ = 32;
+            __m256i vw00 = _mm256_set1_epi8(w00), vw01 = _mm256_set1_epi8(w01), vw02 = _mm256_set1_epi8(w02),
+                    vw10 = _mm256_set1_epi8(w10), vw11 = _mm256_set1_epi8(w11), vw12 = _mm256_set1_epi8(w12),
+                    vw20 = _mm256_set1_epi8(w20), vw21 = _mm256_set1_epi8(w21), vw22 = _mm256_set1_epi8(w22);
+            __m256i vbias = _mm256_set1_epi32(biasCopy), voutzp = _mm256_set1_epi32(outZp),
+                    outmin = _mm256_set1_epi32(-128), outmax = _mm256_set1_epi32(127);
+            __m256 vmult = _mm256_set1_ps(mult);
+            __m256i vout0, vout1, vout2, vout3;
+
+            if( stride_w == 1 )
+            {
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1)
+                    {
+                        if (out_j <= pad_l)
+                            break;
+                        out_j = outW1 - VECSZ;
+                    }
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256i v00 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j)),
+                            v01 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w)),
+                            v02 = _mm256_loadu_si256((const __m256i*)(imgptr0 + in_j + dilation_w*2)),
+                            v10 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j)),
+                            v11 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w)),
+                            v12 = _mm256_loadu_si256((const __m256i*)(imgptr1 + in_j + dilation_w*2)),
+                            v20 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j)),
+                            v21 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w)),
+                            v22 = _mm256_loadu_si256((const __m256i*)(imgptr2 + in_j + dilation_w*2));
+
+                    vout0 = vout1 = vout2 = vout3 = vbias;
+                    _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
+
+                    vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult)));
+                    vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult)));
+                    vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult)));
+                    vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult)));
+
+                    vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax);
+                    vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax);
+                    vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax);
+                    vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax);
+
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3);
+                }
+            }
+            else
+            {
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1)
+                    {
+                        if (out_j <= pad_l)
+                            break;
+                        out_j = outW1 - VECSZ;
+                    }
+                    int in_j = out_j * stride_w - pad_l;
+                    __m256i v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
+                    _mm256_load_deinterleave(imgptr0 + in_j, v00, v01);
+                    _mm256_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
+                    _mm256_load_deinterleave(imgptr1 + in_j, v10, v11);
+                    _mm256_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
+                    _mm256_load_deinterleave(imgptr2 + in_j, v20, v21);
+                    _mm256_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
+
+                    vout0 = vout1 = vout2 = vout3 = vbias;
+                    _mm256_expand_mul_add(v00, vw00, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v01, vw01, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v02, vw02, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v10, vw10, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v11, vw11, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v12, vw12, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v20, vw20, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v21, vw21, vout0, vout1, vout2, vout3);
+                    _mm256_expand_mul_add(v22, vw22, vout0, vout1, vout2, vout3);
+
+                    vout0 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout0), vmult)));
+                    vout1 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout1), vmult)));
+                    vout2 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout2), vmult)));
+                    vout3 = _mm256_add_epi32(voutzp, _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(vout3), vmult)));
+
+                    vout0 = _mm256_min_epi32(_mm256_max_epi32(vout0, outmin), outmax);
+                    vout1 = _mm256_min_epi32(_mm256_max_epi32(vout1, outmin), outmax);
+                    vout2 = _mm256_min_epi32(_mm256_max_epi32(vout2, outmin), outmax);
+                    vout3 = _mm256_min_epi32(_mm256_max_epi32(vout3, outmin), outmax);
+
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j), vout0);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 8), vout1);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 16), vout2);
+                    _mm256_storeu_si256((__m256i*)(outptr + out_j + 24), vout3);
+                }
+            }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = (int)imgptr0[in_j]*w00 + (int)imgptr0[in_j + dilation_w]*w01 + (int)imgptr0[in_j + dilation_w*2]*w02 +
+                  (int)imgptr1[in_j]*w10 + (int)imgptr1[in_j + dilation_w]*w11 + (int)imgptr1[in_j + dilation_w*2]*w12 +
+                  (int)imgptr2[in_j]*w20 + (int)imgptr2[in_j + dilation_w]*w21 + (int)imgptr2[in_j + dilation_w*2]*w22 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            int s0 = 1, s1 = 1, s2 = 1;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0;
+                biasCopy += inpZp*(w00 + w10 + w20);
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0;
+                biasCopy += inpZp*(w01 + w11 + w21);
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0;
+                biasCopy += inpZp*(w02 + w12 + w22);
+            }
+            out = (int)imgptr0[in_j0]*w00*s0 + (int)imgptr0[in_j1]*w01*s1 + (int)imgptr0[in_j2]*w02*s2 +
+                  (int)imgptr1[in_j0]*w10*s0 + (int)imgptr1[in_j1]*w11*s1 + (int)imgptr1[in_j2]*w12*s2 +
+                  (int)imgptr2[in_j0]*w20*s0 + (int)imgptr2[in_j1]*w21*s1 + (int)imgptr2[in_j2]*w22*s2 + biasCopy;
+            outptr[out_j] = std::min(std::max(outZp + (int)std::round(out*mult), -128), 127);
+        }
+    }
+    _mm256_zeroupper();
+}
+
+// dst = vec * weights^t + bias
+void fastGEMM1T( const int8_t* vec, const int8_t* weights,
+                 size_t wstep, const int* bias, const float* multiplier,
+                 int* dst, int nvecs, int vecsize, int outZp )
+{
+    int i = 0;
+
+    for( ; i <= nvecs - 8; i += 8 )
+    {
+        const int8_t* wptr = weights + i*wstep;
+        __m256i vs0 = _mm256_setzero_si256(), vs1 = _mm256_setzero_si256(),
+                vs2 = _mm256_setzero_si256(), vs3 = _mm256_setzero_si256(),
+                vs4 = _mm256_setzero_si256(), vs5 = _mm256_setzero_si256(),
+                vs6 = _mm256_setzero_si256(), vs7 = _mm256_setzero_si256();
+
+        __m128i voutzp = _mm_set1_epi32(outZp);
+        __m128i outmin = _mm_set1_epi32(-128), outmax = _mm_set1_epi32(127);
+
+        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
+        {
+            __m256i v = _mm256_load_si256((const __m256i*)(vec + k));
+
+            vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
+            vs1 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep)), v, vs1);
+            vs2 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*2)), v, vs2);
+            vs3 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*3)), v, vs3);
+            vs4 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*4)), v, vs4);
+            vs5 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*5)), v, vs5);
+            vs6 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*6)), v, vs6);
+            vs7 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)(wptr + wstep*7)), v, vs7);
+        }
+
+        __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs1), _mm256_hadd_epi32(vs2, vs3));
+        __m256i s1 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs4, vs5), _mm256_hadd_epi32(vs6, vs7));
+
+        s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
+        s1 = _mm256_add_epi32(s1, _mm256_permute2x128_si256(s1, s1, 1));
+
+        __m128i t0 = _mm_add_epi32(_mm256_castsi256_si128(s0), _mm_loadu_si128((__m128i*)(bias + i)));
+        __m128i t1 = _mm_add_epi32(_mm256_castsi256_si128(s1), _mm_loadu_si128((__m128i*)(bias + i + 4)));
+
+        t0 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t0), _mm_loadu_ps(multiplier + i))));
+        t1 = _mm_add_epi32(voutzp, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(t1), _mm_loadu_ps(multiplier + i + 4))));
+
+        t0 = _mm_min_epi32(_mm_max_epi32(t0, outmin), outmax);
+        t1 = _mm_min_epi32(_mm_max_epi32(t1, outmin), outmax);
+
+        _mm_storeu_si128((__m128i*)(dst + i), t0);
+        _mm_storeu_si128((__m128i*)(dst + i + 4), t1);
+    }
+
+    for( ; i < nvecs; i++ )
+    {
+        const int8_t* wptr = weights + i*wstep;
+        __m256i vs0 = _mm256_setzero_si256();
+
+        for( int k = 0; k < vecsize; k += 32, wptr += 32 )
+        {
+            __m256i v = _mm256_load_si256((const __m256i*)(vec + k));
+            vs0 = _mm256_fmaddepi8_epi32(_mm256_load_si256((const __m256i*)wptr), v, vs0);
+        }
+
+        __m256i s0 = _mm256_hadd_epi32(_mm256_hadd_epi32(vs0, vs0), vs0);
+        s0 = _mm256_add_epi32(s0, _mm256_permute2x128_si256(s0, s0, 1));
+        int temp = _mm_extract_epi32(_mm256_castsi256_si128(s0), 0);
+        dst[i] = outZp + (int)std::round((temp + bias[i]) * multiplier[i]);
+    }
+
+    _mm256_zeroupper();
+}
+#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+}} // namespace
diff --git a/modules/dnn/src/int8layers/pooling_layer.cpp b/modules/dnn/src/int8layers/pooling_layer.cpp
new file mode 100644
index 000000000000..20a0486a4625
--- /dev/null
+++ b/modules/dnn/src/int8layers/pooling_layer.cpp
@@ -0,0 +1,595 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include <float.h>
+#include <algorithm>
+#include <numeric>
+using std::max;
+using std::min;
+
+namespace cv
+{
+namespace dnn
+{
+
+class PoolingLayerInt8Impl CV_FINAL : public PoolingLayerInt8
+{
+public:
+    PoolingLayerInt8Impl(const LayerParams& params)
+    {
+        computeMaxIdx = false;
+        globalPooling = false;
+        isGlobalPooling = std::vector<bool>(3, false);
+        output_zp = params.get<int>("zeropoints");
+        input_zp = params.get<int>("input_zeropoint", 0);
+        multiplier = params.get<float>("multiplier", 1.f);
+
+        hasDynamicShapes = params.get<bool>("has_dynamic_shapes", false);
+        shapesInitialized = !hasDynamicShapes;
+
+        if (params.has("pool") || params.has("kernel_size") ||
+            params.has("kernel_w") || params.has("kernel_h"))
+        {
+            String pool = toLowerCase(params.get<String>("pool", "max"));
+            if (pool == "max")
+                type = MAX;
+            else if (pool == "ave")
+                type = AVE;
+            else if (pool == "sum")
+                type = SUM;
+            else
+                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
+
+            getPoolingKernelParams(params, kernel_size, isGlobalPooling, pads_begin, pads_end, strides, padMode);
+            globalPooling = isGlobalPooling[0] || isGlobalPooling[1] || isGlobalPooling[2];
+        }
+        else
+            CV_Error(Error::StsBadArg, "Cannot determine pooling type");
+        setParamsFrom(params);
+        ceilMode = params.get<bool>("ceil_mode", true);
+        spatialScale = params.get<float>("spatial_scale", 1);
+        avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true);
+    }
+
+    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        CV_Assert(!inputs.empty());
+        CV_Assert(outputs.size() == 1);
+
+        std::vector<int> inp;
+        std::vector<int> out;
+        for (int i = 2; i < inputs[0].dims; i++) {
+            inp.push_back(inputs[0].size[i]);
+            out.push_back(outputs[0].size[i]);
+        }
+        if (globalPooling) {
+            std::vector<size_t> finalKernel;
+            for (int i = 0; i < inp.size(); i++) {
+                int idx = isGlobalPooling.size() - inp.size() + i;
+                finalKernel.push_back(isGlobalPooling[idx] ? inp[i] : kernel_size[idx]);
+             }
+             kernel_size = finalKernel;
+         }
+
+        getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end);
+
+        if (inputs[0].dims == 3)
+        {
+            // Pool1D
+            kernel_size.assign(1, kernel_size[0]);
+            strides.assign(1, strides[0]);
+            pads_begin.assign(1, pads_begin[0]);
+            pads_end.assign(1, pads_end[0]);
+        }
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        if (backendId == DNN_BACKEND_OPENCV)
+        {
+            if (kernel_size.size() == 3)
+                return preferableTarget == DNN_TARGET_CPU;
+            if (kernel_size.size() <= 2)
+                return true;
+            else
+                return false;
+        }
+        return false;
+    }
+
+    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            return activ_int8->blobs.empty();
+        }
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        switch (type)
+        {
+            case MAX:
+            {
+                CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
+                maxPooling(inputs[0], outputs[0]);
+                break;
+            }
+            case AVE: case SUM:
+                CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
+                avePooling(inputs[0], outputs[0]);
+                break;
+            default:
+                CV_Error(Error::StsNotImplemented, "Not implemented");
+                break;
+        }
+    }
+
+    class PoolingInvoker : public ParallelLoopBody
+    {
+    public:
+        const Mat* src, *rois;
+        Mat *dst;
+        int pad_l, pad_t, pad_r, pad_b;
+        bool avePoolPaddedArea;
+        int nstripes, inpZp, outZp;
+        std::vector<int> ofsbuf;
+        int poolingType;
+        float multiplier;
+        float spatialScale;
+
+        std::vector<size_t> pads_begin, pads_end;
+        std::vector<size_t> kernel_size;
+        std::vector<size_t> strides;
+
+        PoolingInvoker() : src(0), rois(0), dst(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0),
+                           avePoolPaddedArea(false), nstripes(0), inpZp(0), outZp(0),
+                           poolingType(MAX), multiplier(1), spatialScale(0){}
+
+        static void run(const Mat& src, const Mat& rois, Mat& dst,
+                        std::vector<size_t> kernel_size, std::vector<size_t> strides,
+                        std::vector<size_t> pads_begin, std::vector<size_t> pads_end,
+                        bool avePoolPaddedArea, int poolingType, float spatialScale,
+                        float multiplier, int inpZp, int outZp, int nstripes)
+        {
+            CV_Assert_N(
+                      src.isContinuous(), dst.isContinuous(),
+                      src.type() == CV_8S, src.type() == dst.type(),
+                      src.dims == 3 || src.dims == 4 || src.dims == 5, dst.dims == 3 || dst.dims == 4 || dst.dims == 5,
+                      src.size[0] == dst.size[0], src.size[1] == dst.size[1], rois.empty());
+
+            PoolingInvoker p;
+
+            bool isPool1D = src.dims == 3;
+            bool isPool3D = src.dims == 5;
+
+            p.src = &src;
+            p.rois = &rois;
+            p.dst = &dst;
+
+            p.kernel_size = kernel_size;
+            p.strides = strides;
+            p.pads_begin = pads_begin;
+            p.pads_end = pads_end;
+
+            p.pad_l = pads_begin.back();
+            p.pad_t = isPool1D ? 0 : pads_begin[pads_begin.size() - 2];
+            p.pad_r = pads_end.back();
+            p.pad_b = isPool1D ? 0 : pads_end[pads_end.size() - 2];
+
+            p.avePoolPaddedArea = avePoolPaddedArea;
+            p.nstripes = nstripes;
+            p.inpZp = inpZp;
+            p.outZp = outZp;
+            p.poolingType = poolingType;
+            p.spatialScale = spatialScale;
+            p.multiplier = multiplier;
+
+            int height = isPool1D ? 1 : src.size[src.dims - 2];
+            int width = src.size[src.dims - 1];
+
+            int kernel_d = isPool3D ? kernel_size[0] : 1;
+            int kernel_h = isPool1D ? 1 : kernel_size[kernel_size.size() - 2];
+            int kernel_w = kernel_size.back();
+
+            p.ofsbuf.resize(kernel_d * kernel_h * kernel_w);
+            for (int i = 0; i < kernel_d; ++i) {
+                for (int j = 0; j < kernel_h; ++j) {
+                    for (int k = 0; k < kernel_w; ++k) {
+                        p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k;
+                    }
+                }
+            }
+
+            parallel_for_(Range(0, nstripes), p, nstripes);
+        }
+
+        void operator()(const Range& r) const CV_OVERRIDE
+        {
+            int channels = dst->size[1];
+
+            bool isPool3D = src->dims == 5;
+            bool isPool2D = src->dims == 4;
+            bool isPool1D = src->dims == 3;
+            int depth = isPool3D? dst->size[2] : 1;
+            int height = isPool1D? 1 : dst->size[dst->dims - 2];
+            int width = dst->size[dst->dims - 1];
+
+            int inp_depth = isPool3D? src->size[2] : 1;
+            int inp_height = isPool1D? 1 : src->size[src->dims - 2];
+            int inp_width = src->size[src->dims - 1];
+
+            size_t total = dst->total();
+            size_t stripeSize = (total + nstripes - 1)/nstripes;
+            size_t stripeStart = r.start*stripeSize;
+            size_t stripeEnd = std::min(r.end*stripeSize, total);
+
+            int kernel_d = isPool3D? kernel_size[0] : 1;
+            int kernel_h = isPool1D? 1 : kernel_size[kernel_size.size() - 2];
+            int kernel_w = kernel_size.back();
+
+            int stride_d = isPool3D? strides[0] : 0;
+            int stride_h = isPool1D? 1 :strides[strides.size() - 2];
+            int stride_w = strides.back();
+
+#if CV_SIMD128
+            const int* ofsptr = (const int*)&ofsbuf[0];
+            if (poolingType == MAX && !ofsptr)
+                CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode");
+#endif
+
+            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
+            {
+                size_t ofs = ofs0;
+                int x0 = (int)(ofs % width);
+                ofs /= width;
+                int y0 = (int)(ofs % height);
+                ofs /= height;
+
+                int d0 = (int)(ofs % depth);
+                ofs /= depth;
+
+                int c = (int)(ofs % channels);
+                int n = (int)(ofs / channels);
+                int ystart, yend;
+                int dstart = 0, dend = 1;
+
+                const int8_t *srcData = 0;
+                int pad_d_begin = (pads_begin.size() == 3) ? pads_begin[0] : 0;
+                dstart = d0 * stride_d - pad_d_begin;
+                dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0]));
+
+                ystart = y0 * stride_h - pad_t;
+                yend = min(ystart + kernel_h, inp_height + pad_b);
+                srcData = src->ptr<int8_t>(n, c);
+
+                int ddelta = dend - dstart;
+                dstart = max(dstart, 0);
+                dend = min(dend, inp_depth);
+                int ydelta = yend - ystart;
+                ystart = max(ystart, 0);
+                yend = min(yend, inp_height);
+                int8_t *dstData = &dst->ptr<int8_t>(n, c, d0)[y0 * width];
+
+                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
+                ofs0 += delta;
+                int x1 = x0 + delta;
+
+                if( poolingType == MAX )
+                    for( ; x0 < x1; x0++ )
+                    {
+                        int xstart = x0 * stride_w - pad_l;
+                        int xend = min(xstart + kernel_w, inp_width);
+                        xstart = max(xstart, 0);
+                        if (xstart >= xend || ystart >= yend)
+                        {
+                            dstData[x0] = (int8_t)outZp;
+                            continue;
+                        }
+#if CV_SIMD128
+                        if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width )
+                        {
+                            v_int8x16 max_val0 = v_setall_s8(-128);
+                            if( yend - ystart == kernel_h )
+                            {
+                                const int8_t* srcData1 = srcData + ystart*inp_width + xstart;
+                                if( stride_w == 1 )
+                                    for (int k = 0; k < kernel_w*kernel_h; k++)
+                                    {
+                                        int index = ofsptr[k];
+                                        v_int8x16 v0 = v_load(srcData1 + index);
+                                        max_val0 = v_max(max_val0, v0);
+                                    }
+                                else if( stride_w == 2 )
+                                    for (int k = 0; k < kernel_w*kernel_h; k++)
+                                    {
+                                        int index = ofsptr[k];
+                                        v_int8x16 v0, dummy;
+                                        v_load_deinterleave(srcData1 + index, v0, dummy);
+                                        max_val0 = v_max(max_val0, v0);
+                                    }
+                                else
+                                    for (int k = 0; k < kernel_w*kernel_h; k++)
+                                    {
+                                        int index = ofsptr[k];
+                                        v_int8x16 v0(srcData1[index], srcData1[index + stride_w],
+                                                     srcData1[index + stride_w*2], srcData1[index + stride_w*3],
+                                                     srcData1[index + stride_w*4], srcData1[index + stride_w*5],
+                                                     srcData1[index + stride_w*6], srcData1[index + stride_w*7],
+                                                     srcData1[index + stride_w*8], srcData1[index + stride_w*9],
+                                                     srcData1[index + stride_w*10], srcData1[index + stride_w*11],
+                                                     srcData1[index + stride_w*12], srcData1[index + stride_w*13],
+                                                     srcData1[index + stride_w*14], srcData1[index + stride_w*15]);
+                                        max_val0 = v_max(max_val0, v0);
+                                    }
+                            }
+                            else
+                            {
+                                for (int y = ystart; y < yend; ++y)
+                                {
+                                    for (int x = xstart; x < xend; ++x)
+                                    {
+                                        const int index = y * inp_width + x;
+                                        v_int8x16 v0(srcData[index], srcData[index + stride_w],
+                                                     srcData[index + stride_w*2], srcData[index + stride_w*3],
+                                                     srcData[index + stride_w*4], srcData[index + stride_w*5],
+                                                     srcData[index + stride_w*6], srcData[index + stride_w*7],
+                                                     srcData[index + stride_w*8], srcData[index + stride_w*9],
+                                                     srcData[index + stride_w*10], srcData[index + stride_w*11],
+                                                     srcData[index + stride_w*12], srcData[index + stride_w*13],
+                                                     srcData[index + stride_w*14], srcData[index + stride_w*15]);
+                                        max_val0 = v_max(max_val0, v0);
+                                    }
+                                }
+                            }
+                            v_store(dstData + x0, max_val0);
+                            x0 += 15;
+                        }
+                        else
+#else
+                        CV_UNUSED(isPool2D);
+#endif
+                        if( isPool1D )
+                        {
+                            const int8_t* first = srcData + xstart;
+                            const int8_t* last = srcData + xend;
+                            const int8_t* max_elem = std::max_element(first, last);
+                            if (max_elem != last)
+                                dstData[x0] = *max_elem;
+                        }
+                        else
+                        {
+                            int8_t max_val = -128;
+                            for (int d = dstart; d < dend; ++d) {
+                                for (int y = ystart; y < yend; ++y) {
+                                    for (int x = xstart; x < xend; ++x) {
+                                        const int index = d * inp_width * inp_height + y * inp_width + x;
+                                        int8_t val = srcData[index];
+                                        max_val = std::max(max_val, val);
+                                    }
+                                }
+                            }
+                            dstData[x0] = max_val;
+                        }
+                    }
+                else if (poolingType == AVE || poolingType == SUM)
+                {
+                    for( ; x0 < x1; ++x0)
+                    {
+                        int xstart = x0 * stride_w - pad_l;
+                        int xend = min(xstart + kernel_w, inp_width + pad_r);
+                        int xdelta = xend - xstart;
+                        xstart = max(xstart, 0);
+                        xend = min(xend, inp_width);
+
+                        int real_kernel_area = (dend - dstart) * (yend - ystart) * (xend - xstart);
+                        int padded_kernel_area = xdelta * ydelta * ddelta;
+                        int kernel_area = avePoolPaddedArea ? padded_kernel_area : real_kernel_area;
+
+                        int bias = (avePoolPaddedArea ? (padded_kernel_area - real_kernel_area) * inpZp : 0)
+                                 - (inpZp * kernel_area);
+                        float inv_kernel_area = poolingType == AVE ? multiplier / kernel_area : multiplier;
+#if CV_SIMD128
+                        if( isPool2D && xstart > 0 && x0 + 15 < x1 && (x0 + 15) * stride_w - pad_l + kernel_w < inp_width )
+                        {
+                            v_int32x4 sum_val0 = v_setall_s32(bias), sum_val1 = v_setall_s32(bias),
+                                      sum_val2 = v_setall_s32(bias), sum_val3 = v_setall_s32(bias),
+                                      voutzp = v_setall_s32(outZp);
+                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
+
+                            for (int y = ystart; y < yend; ++y)
+                            {
+                                for (int x = xstart; x < xend; ++x)
+                                {
+                                    const int index = y * inp_width + x;
+                                    v_int32x4 v0((int)srcData[index], (int)srcData[index + stride_w],
+                                                 (int)srcData[index + stride_w*2], (int)srcData[index + stride_w*3]);
+                                    v_int32x4 v1((int)srcData[index + stride_w*4], (int)srcData[index + stride_w*5],
+                                                 (int)srcData[index + stride_w*6], (int)srcData[index + stride_w*7]);
+                                    v_int32x4 v2((int)srcData[index + stride_w*8], (int)srcData[index + stride_w*9],
+                                                 (int)srcData[index + stride_w*10], (int)srcData[index + stride_w*11]);
+                                    v_int32x4 v3((int)srcData[index + stride_w*12], (int)srcData[index + stride_w*13],
+                                                 (int)srcData[index + stride_w*14], (int)srcData[index + stride_w*15]);
+                                    sum_val0 += v0;
+                                    sum_val1 += v1;
+                                    sum_val2 += v2;
+                                    sum_val3 += v3;
+                                }
+                            }
+
+                            sum_val0 = v_round(v_cvt_f32(sum_val0)*ikarea) + voutzp;
+                            sum_val1 = v_round(v_cvt_f32(sum_val1)*ikarea) + voutzp;
+                            sum_val2 = v_round(v_cvt_f32(sum_val2)*ikarea) + voutzp;
+                            sum_val3 = v_round(v_cvt_f32(sum_val3)*ikarea) + voutzp;
+
+                            v_store(dstData + x0, v_pack(v_pack(sum_val0, sum_val1), v_pack(sum_val2, sum_val3)));
+                            x0 += 15;
+                        }
+                        else
+#endif
+                        if( isPool1D )
+                        {
+                            const int8_t* first = srcData + xstart;
+                            const int8_t* last = srcData + xend;
+                            int sum_val = bias + std::accumulate(first, last, 0);
+                            dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area));
+                        }
+                        else
+                        {
+                            int sum_val = bias;
+                            for (int d = dstart; d < dend; ++d) {
+                                for (int y = ystart; y < yend; ++y) {
+                                    for (int x = xstart; x < xend; ++x) {
+                                        const int index = d * inp_width * inp_height + y * inp_width + x;
+                                        int8_t val = srcData[index];
+                                        sum_val += (int)val;
+                                    }
+                                }
+                            }
+                            dstData[x0] = saturate_cast<int8_t>(outZp + std::round(sum_val*inv_kernel_area));
+                        }
+                    }
+                }
+            }
+        }
+    };
+
+    void maxPooling(Mat &src, Mat &dst)
+    {
+        const int nstripes = getNumThreads();
+        Mat rois;
+        PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type,
+                            spatialScale, multiplier, input_zp, output_zp, nstripes);
+    }
+
+    void avePooling(Mat &src, Mat &dst)
+    {
+        const int nstripes = getNumThreads();
+        Mat rois;
+        PoolingInvoker::run(src, rois, dst, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type,
+                            spatialScale, multiplier, input_zp, output_zp, nstripes);
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() != 0);
+
+        bool isPool1D = inputs[0].size() == 3;
+        std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
+        std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2);
+
+        std::vector<size_t> local_kernel;
+        if (globalPooling) {
+            for (int i = 0; i < inpShape.size(); i++) {
+                int idx = isGlobalPooling.size() - inpShape.size() + i;
+                local_kernel.push_back(isGlobalPooling[idx] ? inpShape[i] : kernel_size[idx]);
+            }
+        } else {
+            local_kernel = kernel_size;
+        }
+
+        if (hasDynamicShapes && !shapesInitialized)
+        {
+            //Just copy input shapes for width and height to prevent errors on loading stage
+            for (int i = 0; i < inpShape.size(); i++)
+                outShape.push_back(inpShape[i]);
+        }
+        else if (padMode.empty())
+        {
+            int addedDims = isPool1D? inpShape.size() : local_kernel.size();
+            for (int i = 0; i < addedDims; i++) {
+                float dst = (float) (inpShape[i] + pads_begin[i] + pads_end[i] - local_kernel[i]) / strides[i];
+                outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst)));
+            }
+
+            // If we have padding, ensure that the last pooling starts strictly
+            // inside the image (instead of at the padding); otherwise clip the last.
+            for (int i = 0; i < addedDims; i++) {
+                if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) {
+                    --outShape[2 + i];
+                    CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]);
+                }
+            }
+        }
+        else {
+            getConvPoolOutParams(inpShape, local_kernel, strides, padMode,
+                                 std::vector<size_t>(local_kernel.size(), 1), outShape);
+        }
+
+        outputs.assign(1, outShape);
+        return false;
+    }
+
+    bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
+    {
+        int dims = inputs[0].size();
+        CV_Assert(inputs[0][dims - 1] > 0 && inputs[0][dims - 2] > 0);
+        shapesInitialized = true;
+        return true;
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(inputs); // suppress unused variable warning
+        long flops = 0;
+        bool isPool1D = inputs[0].size() == 3;
+        size_t karea = std::accumulate(kernel_size.begin(), isPool1D? kernel_size.begin() + 1 : kernel_size.end(),
+                                    1, std::multiplies<size_t>());
+        for(int i = 0; i < outputs.size(); i++)
+        {
+            if (type == MAX)
+            {
+                if (i%2 == 0)
+                    flops += total(outputs[i])*karea;
+            }
+            else
+            {
+                flops += total(outputs[i])*(karea + 1);
+            }
+        }
+        return flops;
+    }
+private:
+    enum Type
+    {
+        MAX,
+        AVE,
+        STOCHASTIC,
+        SUM,
+        ROI,   // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
+        PSROI  // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
+    };
+    bool hasDynamicShapes;
+    bool shapesInitialized;
+    float multiplier;
+};
+
+Ptr<PoolingLayerInt8> PoolingLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<PoolingLayerInt8>(new PoolingLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp
new file mode 100644
index 000000000000..2ddb76a0e80d
--- /dev/null
+++ b/modules/dnn/src/int8layers/quantize_dequantize_layer.cpp
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+
+class QuantizeLayerImpl CV_FINAL : public QuantizeLayer
+{
+public:
+    QuantizeLayerImpl(const LayerParams& params)
+    {
+        scale = params.get<float>("scales", 1.0f);
+        zeropoint = params.get<int>("zeropoints", 0);
+        setParamsFrom(params);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() == 1);
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+    }
+
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        std::vector<UMat> inputs, outputs;
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        if (inputs_.depth() == CV_16S)
+        {
+            UMat inputFp32(shape(inputs[0]), CV_32F);
+            convertFp16(inputs[0], inputFp32);
+            inputFp32.copyTo(inputs[0]);
+        }
+
+        inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
+        return true;
+    }
+#endif
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        inputs[0].convertTo(outputs[0], CV_8S, 1.f/scale, zeropoint);
+    }
+};
+
+class DequantizeLayerImpl CV_FINAL : public DequantizeLayer
+{
+public:
+    DequantizeLayerImpl(const LayerParams& params)
+    {
+        scale = params.get<float>("scales", 1.0f);
+        zeropoint = params.get<int>("zeropoints", 0);
+        setParamsFrom(params);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() == 1);
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        return false;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+    }
+
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
+    {
+        std::vector<UMat> inputs, outputs;
+        inputs_.getUMatVector(inputs);
+        outputs_.getUMatVector(outputs);
+
+        UMat outputFp32(shape(outputs[0]), CV_32F);
+        inputs[0].convertTo(outputFp32, CV_32F, scale, -(scale*zeropoint));
+
+        if (outputs_.depth() == CV_16S)
+            convertFp16(outputFp32, outputs[0]);
+        else
+            outputFp32.copyTo(outputs[0]);
+        return true;
+    }
+#endif
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        inputs[0].convertTo(outputs[0], CV_32F, scale, -(scale*zeropoint));
+    }
+};
+
+Ptr<QuantizeLayer> QuantizeLayer::create(const LayerParams& params)
+{
+    return Ptr<QuantizeLayer>(new QuantizeLayerImpl(params));
+}
+
+Ptr<DequantizeLayer> DequantizeLayer::create(const LayerParams& params)
+{
+    return Ptr<DequantizeLayer>(new DequantizeLayerImpl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/int8layers/scale_layer.cpp b/modules/dnn/src/int8layers/scale_layer.cpp
new file mode 100644
index 000000000000..d7f676d047ab
--- /dev/null
+++ b/modules/dnn/src/int8layers/scale_layer.cpp
@@ -0,0 +1,211 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv
+{
+namespace dnn
+{
+
+class ScaleLayerInt8Impl CV_FINAL : public ScaleLayerInt8
+{
+public:
+    Mat weights, bias;
+    ScaleLayerInt8Impl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        hasBias = params.get<bool>("bias_term", false);
+        axis = params.get<int>("axis", 1);
+        hasWeights = false;
+
+        output_sc = params.get<float>("scales");
+        output_zp = params.get<int>("zeropoints");
+
+        DictValue inpSc = params.get("input_scales");
+        DictValue inpZp = params.get("input_zeropoints");
+
+        for (int i = 0; i < inpSc.size(); i++)
+        {
+            inp_sc.push_back(inpSc.get<float>(i));
+            inp_zp.push_back(inpZp.get<int>(i));
+        }
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        outputs.assign(1, inputs[0]);
+        return true;
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
+    {
+        std::vector<Mat> inputs;
+        inputs_arr.getMatVector(inputs);
+        hasWeights = blobs.size() == 2 || (blobs.size() <= 1 && !hasBias);
+        CV_Assert((inputs.size() == 2 && blobs.empty()) || blobs.size() == (int)hasWeights + (int)hasBias);
+
+        if (!blobs.empty())
+        {
+            Mat w = hasWeights ? blobs[0] : Mat::ones(blobs[0].size(), CV_32F);
+            Mat b = hasBias ? blobs.back() : Mat::zeros(blobs.back().size(), CV_32F);
+
+            w = w.reshape(1, 1);
+            b = b.reshape(1, 1);
+
+            w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
+            addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
+        }
+        else
+        {
+            // initialized during forward()
+            weights = Mat(); bias = Mat();
+        }
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        Ptr<ActivationLayerInt8> activ_int8 = layer.dynamicCast<ActivationLayerInt8>();
+        if (!activ_int8.empty())
+        {
+            return activ_int8->blobs.empty();
+        }
+        return false;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> inputs, outputs;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        Mat &inpBlob = inputs[0];
+        Mat &outBlob = outputs[0];
+
+        if (blobs.empty())
+        {
+            CV_Assert(inp_sc.size() == 2 && inp_zp.size() == 2);
+            Mat inp_dequantized, w, b;
+            inputs[1].reshape(1, 1).convertTo(inp_dequantized, CV_32F, inp_sc[1], -(inp_sc[1]*inp_zp[1]));
+            w = hasWeights ? inp_dequantized : Mat::ones(inp_dequantized.size(), CV_32F);
+            b = hasBias ? inp_dequantized : Mat::zeros(inp_dequantized.size(), CV_32F);
+
+            w.convertTo(weights, CV_32F, inp_sc[0]/output_sc);
+            addWeighted(b, 1.0/output_sc, weights, -inp_zp[0], output_zp, bias, CV_32F);
+        }
+
+        MatShape inpShape = shape(inpBlob);
+        const int numWeights = weights.total();
+        CV_Assert(numWeights != 0);
+        CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
+
+        int endAxis;
+        for (endAxis = axis + 1; endAxis <= inpBlob.dims; ++endAxis)
+        {
+            if (total(inpShape, axis, endAxis) == numWeights)
+                break;
+        }
+        CV_Assert(total(inpShape, axis, endAxis) == numWeights);
+        CV_CheckTypeEQ(inpBlob.type(), CV_8SC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_8SC1, "");
+
+        int numSlices = total(inpShape, 0, axis);
+        int8_t* inpData = (int8_t*)inpBlob.data;
+        int8_t* outData = (int8_t*)outBlob.data;
+
+        if (endAxis != inpBlob.dims)
+        {
+            float* weightsData = (float*)weights.data;
+            float* biasesData = (float*)bias.data;
+            int spatialSize = total(inpShape, endAxis);  // spatialSize != 1
+            for (int i = 0; i < numSlices; ++i)
+            {
+                for (int j = 0; j < numWeights; ++j)
+                {
+                    float w = weightsData[j];
+                    float b = biasesData[j];
+                    Mat inpSlice(1, spatialSize, CV_8S, inpData);
+                    Mat outSlice(1, spatialSize, CV_8S, outData);
+                    inpSlice.convertTo(outSlice, CV_8S, w, b);
+                    inpData += spatialSize;
+                    outData += spatialSize;
+                }
+            }
+        }
+        else
+        {
+            for (int i = 0; i < numSlices; ++i)
+            {
+                Mat inpSlice(1, numWeights, CV_8S, inpData);
+                Mat outSlice(1, numWeights, CV_8S, outData);
+
+                multiply(inpSlice, weights, outSlice, 1.0, CV_8S);
+                add(outSlice, bias, outSlice, Mat(), CV_8S);
+
+                inpData += numWeights;
+                outData += numWeights;
+            }
+        }
+    }
+
+    void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
+    {
+        scale = (hasWeights && !blobs.empty()) ? blobs[0] : Mat();
+        shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
+    }
+
+    void getScaleZeropoint(float& scale, int& zeropoint) const CV_OVERRIDE
+    {
+        scale = output_sc;
+        zeropoint = output_zp;
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(outputs); // suppress unused variable warning
+        long flops = 0;
+        for(int i = 0; i < inputs.size(); i++)
+        {
+            flops += 2*total(inputs[i]);
+        }
+        return flops;
+    }
+
+private:
+    bool hasWeights;
+    std::vector<float> inp_sc;
+    std::vector<int> inp_zp;
+};
+
+
+Ptr<ScaleLayerInt8> ScaleLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(params));
+}
+
+Ptr<Layer> ShiftLayerInt8::create(const LayerParams& params)
+{
+    LayerParams scaleParams = params;
+    scaleParams.type = "ScaleInt8";
+    scaleParams.set("bias_term", true);
+    scaleParams.set("axis", 0);
+    return Ptr<ScaleLayerInt8>(new ScaleLayerInt8Impl(scaleParams));
+}
+
+}  // namespace dnn
+}  // namespace cv
diff --git a/modules/dnn/src/int8layers/softmax_layer.cpp b/modules/dnn/src/int8layers/softmax_layer.cpp
new file mode 100644
index 000000000000..7e3c82bc21ab
--- /dev/null
+++ b/modules/dnn/src/int8layers/softmax_layer.cpp
@@ -0,0 +1,176 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+#include <algorithm>
+#include <stdlib.h>
+
+namespace cv
+{
+namespace dnn
+{
+
+class SoftMaxLayerInt8Impl CV_FINAL : public SoftmaxLayerInt8
+{
+public:
+
+    SoftMaxLayerInt8Impl(const LayerParams& params)
+    {
+        axisRaw = params.get<int>("axis", 1);
+        logSoftMax = params.get<bool>("log_softmax", false);
+        output_sc = params.get<float>("scales");
+        output_zp = params.get<int>("zeropoints");
+        setParamsFrom(params);
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        bool inplace = Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        MatShape shape = inputs[0];
+        int cAxis = normalize_axis(axisRaw, shape.size());
+        shape[cAxis] = 1;
+        internals.assign(1, shape);
+        return inplace;
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        return backendId == DNN_BACKEND_OPENCV;
+    }
+
+    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
+    {
+        Ptr<DequantizeLayer> dequantize_layer = top.dynamicCast<DequantizeLayer>();
+        return !dequantize_layer.empty() && preferableTarget != DNN_TARGET_OPENCL_FP16;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        std::vector<Mat> inputs, outputs, internals;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+        internals_arr.getMatVector(internals);
+
+        const Mat &src = inputs[0];
+        Mat &dst = outputs[0];
+
+        int axis = normalize_axis(axisRaw, src.dims);
+        size_t outerSize = src.total(0, axis), channels = src.size[axis],
+               innerSize = src.total(axis + 1);
+
+        CV_Assert(src.type() == CV_8S && (dst.type() == CV_8S || dst.type() == CV_32F));
+        CV_Assert(src.isContinuous() && dst.isContinuous());
+
+        size_t outerStep = src.total(axis);
+        size_t cnStep = src.total(axis + 1);
+        const int8_t *srcPtr = src.ptr<int8_t>();
+        const float *expPtr = blobs[0].ptr<float>();
+
+        if (dst.type() == CV_32F)
+        {
+            float *dstPtr = dst.ptr<float>();
+            for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+            {
+                size_t srcOffset = outerDim * outerStep;
+                std::vector<float> expSum(innerSize, 0.f);
+
+                // sum exp along axis
+                for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                {
+                    const int offset = srcOffset + cnDim * cnStep;
+                    for (size_t i = 0; i < innerSize; i++)
+                        expSum[i] += expPtr[srcPtr[offset + i] + 128];
+                }
+
+                // divide by computed sum
+                for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                {
+                    const int offset = srcOffset + cnDim * cnStep;
+                    for (size_t i = 0; i < innerSize; i++)
+                        dstPtr[offset + i] = expPtr[srcPtr[offset + i] + 128]/expSum[i];
+                }
+
+                if (logSoftMax)
+                {
+                    for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                    {
+                        const int offset = srcOffset + cnDim * cnStep;
+                        for (size_t i = 0; i < innerSize; i++)
+                            dstPtr[offset + i] = log(dstPtr[offset + i]);
+                    }
+                }
+            }
+        }
+        else
+        {
+            const float inv_scale = 1.f/output_sc;
+            int8_t *dstPtr = dst.ptr<int8_t>();
+            for (size_t outerDim = 0; outerDim < outerSize; outerDim++)
+            {
+                size_t srcOffset = outerDim * outerStep;
+                std::vector<float> expSum(innerSize, 0.f);
+
+                // sum exp along axis
+                for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                {
+                    const int offset = srcOffset + cnDim * cnStep;
+                    for (size_t i = 0; i < innerSize; i++)
+                        expSum[i] += expPtr[srcPtr[offset + i] + 128];
+                }
+
+                // divide by computed sum and quantize to int8
+                if (logSoftMax)
+                {
+                    for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                    {
+                        const int offset = srcOffset + cnDim * cnStep;
+                        for (size_t i = 0; i < innerSize; i++)
+                            dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*log(expPtr[srcPtr[offset + i] + 128]/expSum[i])));
+                    }
+                }
+                else
+                {
+                    for (size_t cnDim = 0; cnDim < channels; cnDim++)
+                    {
+                        const int offset = srcOffset + cnDim * cnStep;
+                        for (size_t i = 0; i < innerSize; i++)
+                            dstPtr[offset + i] = saturate_cast<int8_t>(output_zp + std::round(inv_scale*(expPtr[srcPtr[offset + i] + 128]/expSum[i])));
+                    }
+                }
+            }
+        }
+    }
+
+    int64 getFLOPS(const std::vector<MatShape> &inputs,
+                  const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_UNUSED(outputs); // suppress unused variable warning
+        int64 flops = 0;
+
+        for (int i = 0; i < inputs.size(); i++)
+        {
+            flops += 4*total(inputs[i]);
+        }
+
+        return flops;
+    }
+
+    int axisRaw;
+};
+
+Ptr<SoftmaxLayerInt8> SoftmaxLayerInt8::create(const LayerParams& params)
+{
+    return Ptr<SoftmaxLayerInt8>(new SoftMaxLayerInt8Impl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp
index 4a0aa6ad0a12..6ab260f22dba 100644
--- a/modules/dnn/src/layers/batch_norm_layer.cpp
+++ b/modules/dnn/src/layers/batch_norm_layer.cpp
@@ -36,6 +36,7 @@ namespace dnn
 class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
 {
 public:
+    Mat origin_weights, origin_bias;
     Mat weights_, bias_;
     UMat umat_weight, umat_bias;
     mutable int dims;
@@ -89,11 +90,11 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
         const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
         const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
 
-        weights_.create(1, (int)n, CV_32F);
-        bias_.create(1, (int)n, CV_32F);
+        origin_weights.create(1, (int)n, CV_32F);
+        origin_bias.create(1, (int)n, CV_32F);
 
-        float* dstWeightsData = weights_.ptr<float>();
-        float* dstBiasData = bias_.ptr<float>();
+        float* dstWeightsData = origin_weights.ptr<float>();
+        float* dstBiasData = origin_bias.ptr<float>();
 
         for (size_t i = 0; i < n; ++i)
         {
@@ -101,15 +102,12 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
             dstWeightsData[i] = w;
             dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
         }
-        // We will use blobs to store origin weights and bias to restore them in case of reinitialization.
-        weights_.copyTo(blobs[0].reshape(1, 1));
-        bias_.copyTo(blobs[1].reshape(1, 1));
     }
 
     virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
     {
-        blobs[0].reshape(1, 1).copyTo(weights_);
-        blobs[1].reshape(1, 1).copyTo(bias_);
+        origin_weights.reshape(1, 1).copyTo(weights_);
+        origin_bias.reshape(1, 1).copyTo(bias_);
     }
 
     void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
@@ -434,6 +432,18 @@ class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        params.set("input_scale", scales[0][0]);
+        params.set("input_zeropoint", zeropoints[0][0]);
+
+        params.blobs.clear();
+        params.blobs.push_back(origin_weights);
+        params.blobs.push_back(origin_bias);
+        return true;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 5f93b458869d..59548a9c0c51 100644
--- a/modules/dnn/src/layers/blank_layer.cpp
+++ b/modules/dnn/src/layers/blank_layer.cpp
@@ -166,6 +166,11 @@ class BlankLayerImpl CV_FINAL : public BlankLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
 };
 
 Ptr<Layer> BlankLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp
index e7701d6efd4c..763138b7bb25 100644
--- a/modules/dnn/src/layers/concat_layer.cpp
+++ b/modules/dnn/src/layers/concat_layer.cpp
@@ -71,6 +71,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
         setParamsFrom(params);
         axis = params.get<int>("axis", 1);
         padding = params.get<bool>("padding", false);
+        paddingValue = params.get<int>("padding_value", 0);
     }
 
     virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
@@ -121,13 +122,14 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
                (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding);
     }
 
+    template <class T>
     class ChannelConcatInvoker : public ParallelLoopBody
     {
     public:
         std::vector<Mat>* inputs;
         Mat* output;
         int nstripes;
-        std::vector<const float*> chptrs;
+        std::vector<const T*> chptrs;
 
         static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
         {
@@ -141,14 +143,14 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
             for( i = 0; i < ninputs; i++ )
             {
                 Mat& inp = inputs[i];
-                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S) &&
+                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
                            inp.dims == 4 && inp.size[0] == output.size[0] &&
                            inp.size[2] == output.size[2] &&
                            inp.size[3] == output.size[3] );
                 nchannels += inp.size[1];
             }
             CV_Assert( nchannels == output.size[1] );
-            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S) );
+            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
 
             cc.chptrs.resize(nchannels*batchsz);
 
@@ -159,7 +161,7 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
                 for( int j = 0; j < batchsz; j++ )
                     for( int k = 0; k < inp.size[1]; k++ )
                     {
-                        const float* ptr = inp.ptr<float>(j, k);
+                        const T* ptr = inp.ptr<T>(j, k);
                         cc.chptrs[ofs + j*nchannels + k] = ptr;
                     }
                 ofs += inp.size[1];
@@ -178,8 +180,8 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
             size_t stripeSize = (total + nstripes - 1)/nstripes;
             size_t stripeStart = r.start*stripeSize;
             size_t stripeEnd = std::min(total, r.end*stripeSize);
-            const float** ptrs = (const float**)&chptrs[0];
-            float* outptr = output->ptr<float>();
+            const T** ptrs = (const T**)&chptrs[0];
+            T* outptr = output->ptr<T>();
             size_t blockSize0 = 1 << 16;
 
             for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
@@ -250,7 +252,8 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                   inputs_arr.depth() != CV_8S,
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
         std::vector<Mat> inputs, outputs;
@@ -261,12 +264,15 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
         Mat& outMat = outputs[0];
 
         if (padding)
-            outMat.setTo(0);
+            outMat.setTo(paddingValue);
 
         if( cAxis == 1 && outMat.dims == 4 && !padding)
         {
             int nstripes = getNumThreads();
-            ChannelConcatInvoker::run(inputs, outMat, nstripes);
+            if (outMat.type() == CV_8S)
+                ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
+            else
+                ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);
         }
         else
         {
@@ -412,6 +418,13 @@ class ConcatLayerImpl CV_FINAL : public ConcatLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        if (padding)
+            params.set("padding_value", zeropoints[1][0]);
+        return true;
+    }
 };
 
 Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp
index 921d5801d15b..1f307b8fa6aa 100644
--- a/modules/dnn/src/layers/const_layer.cpp
+++ b/modules/dnn/src/layers/const_layer.cpp
@@ -124,6 +124,15 @@ class ConstLayerImpl CV_FINAL : public ConstLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        Mat quantizedBlob;
+        blobs[0].convertTo(quantizedBlob, CV_8S, 1.f/scales[1][0], zeropoints[1][0]);
+        params.blobs.clear();
+        params.blobs.push_back(quantizedBlob);
+        return true;
+    }
 };
 
 Ptr<Layer> ConstLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 90d0272c7063..0b550edbab38 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -1033,11 +1033,12 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         bool useAVX;
         bool useAVX2;
         bool useAVX512;
+        bool useRVV;
         int blk_size_cn;
 
         ParallelConv()
             : input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
-              biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false)
+              biasvec_(0), reluslope_(0), activ_(0), is1x1_(false), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false)
             , blk_size_cn(0)
         {}
 
@@ -1095,6 +1096,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             p.useAVX    = checkHardwareSupport(CPU_AVX)  && isConv2D;
             p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;
+            p.useRVV   = checkHardwareSupport(CPU_RVV) && isConv2D;
 
             int kernel_d = isConv3D? kernel_size[0] : 1;
             int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
@@ -1295,6 +1297,13 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
                                     stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
                                     biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
                             else
+                        #endif
+                        #if CV_TRY_RVV
+                            if(useRVV)
+                                opt_RVV::fastDepthwiseConv(wptr, kernel_h, kernel_w,
+                                    stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
+                                    biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
+                            else
                         #endif
                             {
                                 const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1665,6 +1674,12 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
                             opt_AVX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
                                          outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
                         else
+                    #endif
+                    #if CV_TRY_RVV
+                        if(useRVV)
+                            opt_RVV::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
+                                         outShape, bsz, vsz, vsz_a, relu, cn0 == 0);
+                        else
                     #endif
                         for( int i = 0; i < outCn; i += 2 )
                         {
@@ -2187,6 +2202,48 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        // References - https://arxiv.org/pdf/1712.05877.pdf
+
+        // Quantized convolution with variable weights is not supported.
+        if (blobs.empty())
+            return false;
+
+        float inputScale = scales[0][0], outputScale = scales[1][0];
+        int inputZp = zeropoints[0][0];
+        params.set("input_zeropoint", inputZp);
+
+        Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
+        Mat biasQuantized(1, numOutput, CV_32S);
+        Mat outputMultiplier(1, numOutput, CV_32F);
+        double realMin, realMax, weightsScale;
+
+        for( int i = 0; i < numOutput; i++ )
+        {
+            // Quantize weights
+            cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
+            realMin = std::min(realMin, 0.0);
+            realMax = std::max(realMax, 0.0);
+            weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
+            weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
+
+            // Quantize biases
+            float biasScale = inputScale * weightsScale;
+            biasQuantized.at<int>(i) = (int)std::round(biasvec[i]/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+
+            // Store multiplier
+            outputMultiplier.at<float>(i) = biasScale / outputScale;
+        }
+
+        params.blobs.clear();
+        params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
+        params.blobs.push_back(biasQuantized);
+        params.blobs.push_back(outputMultiplier);
+        return true;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
@@ -2416,6 +2473,7 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             useAVX = checkHardwareSupport(CPU_AVX);
             useAVX2 = checkHardwareSupport(CPU_AVX2);
             useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
+            useRVV = checkHardwareSupport(CPU_RVV);
         }
 
         void operator()(const Range& range_) const CV_OVERRIDE
@@ -2447,6 +2505,12 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
             if( useAVX )
                 opt_AVX::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
             else
+        #endif
+        #if CV_TRY_RVV
+            if( useRVV ) {
+                opt_RVV::fastGEMM( aptr, astep, bptr, bstep, cptr, cstep, mmax, kmax, nmax );
+            }
+            else
         #endif
             for( m = 0; m < mmax; m += 2 )
             {
@@ -2546,6 +2610,7 @@ class DeConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
         bool useAVX;
         bool useAVX2;
         bool useAVX512;
+        bool useRVV;
     };
 
     class Col2ImInvoker : public cv::ParallelLoopBody
diff --git a/modules/dnn/src/layers/cumsum_layer.cpp b/modules/dnn/src/layers/cumsum_layer.cpp
new file mode 100644
index 000000000000..9c70f306d486
--- /dev/null
+++ b/modules/dnn/src/layers/cumsum_layer.cpp
@@ -0,0 +1,131 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+
+#include <opencv2/dnn/shape_utils.hpp>
+
+namespace cv
+{
+namespace dnn
+{
+
+class CumSumLayerImpl CV_FINAL : public CumSumLayer
+{
+public:
+    CumSumLayerImpl(const LayerParams &params)
+    {
+        axis_raw = params.get<int>("axis", 0);
+        exclusive_raw = params.get<int>("exclusive", 0);
+        reverse_raw = params.get<int>("reverse", 0);
+        setParamsFrom(params);
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
+        return true;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16S)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> inputs, outputs, internals;
+        inputs_arr.getMatVector(inputs);
+        outputs_arr.getMatVector(outputs);
+
+        // Get x tensor.
+        const auto &src_mat = inputs[0];
+        const auto *src_ptr = src_mat.ptr<float>();
+
+        // Get axis.
+        const int axis = normalize_axis(axis_raw, src_mat.dims);
+
+        // Get y tensor.
+        auto &dst_mat = outputs[0];
+        src_mat.copyTo(dst_mat);
+        auto *dst_ptr = dst_mat.ptr<float>();
+
+        // Get flags.
+        const auto exclusive = exclusive_raw == 1;
+        const auto reverse = reverse_raw == 1;
+
+        // Get parameters to iterate outer dimension.
+        const size_t outer_size = src_mat.total(0, axis);
+        const size_t outer_step_length = src_mat.total(axis);
+
+        // Get parameters to iterate inner dimension.
+        const size_t inner_size = src_mat.size[axis];
+
+        if (!inner_size)
+            return;
+
+        const size_t inner_step_length = src_mat.total(axis + 1);
+        const int inner_step = (reverse ? -1 : 1) * inner_step_length;
+        const int inner_start = reverse ? inner_size - 1 : 0;
+        const int inner_stop = reverse ? -1 : inner_size;
+        const int inner_delta = reverse ? -1 : 1;
+
+        // Get parameters to populate channels.
+        const size_t num_channels = src_mat.total(axis + 1);
+
+        for (size_t outer_dim = 0; outer_dim < outer_size; outer_dim++)
+        {
+            const size_t outer_offset = outer_dim * outer_step_length;
+            size_t src_offset = outer_offset + inner_start * inner_step_length;
+
+            // Populate first element of inner dimension.
+            for (size_t channel = 0; channel < num_channels; channel++)
+            {
+                if (exclusive)
+                {
+                    dst_ptr[src_offset + channel] = 0.0f;
+                }
+                else
+                {
+                    dst_ptr[src_offset + channel] = src_ptr[src_offset + channel];
+                    src_offset += inner_step;
+                }
+            }
+
+            // Populate remaining elements of inner dimension.
+            for (int inner_dim = inner_start + inner_delta; inner_dim != inner_stop; inner_dim += inner_delta)
+            {
+                const size_t dst_offset = outer_offset + inner_dim * inner_step_length;
+
+                for (size_t channel = 0; channel < num_channels; channel++)
+                {
+                    const size_t previous_dst_offset = dst_offset - inner_step;
+                    dst_ptr[dst_offset + channel] = dst_ptr[previous_dst_offset + channel] +
+                            src_ptr[src_offset + channel];
+                    src_offset += inner_step;
+                }
+            }
+        }
+    }
+
+    int axis_raw;
+    int exclusive_raw;
+    int reverse_raw;
+};
+
+Ptr<CumSumLayer> CumSumLayer::create(const LayerParams& params)
+{
+    return Ptr<CumSumLayer>(new CumSumLayerImpl(params));
+}
+
+}
+}
diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp
index de97c873af39..77d86d5652e5 100644
--- a/modules/dnn/src/layers/detection_output_layer.cpp
+++ b/modules/dnn/src/layers/detection_output_layer.cpp
@@ -462,7 +462,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
             // Retrieve all prior bboxes
             std::vector<util::NormalizedBBox> priorBBoxes;
             std::vector<std::vector<float> > priorVariances;
-            GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, priorBBoxes, priorVariances);
+            GetPriorBBoxes(priorData, numPriors, _bboxesNormalized, _varianceEncodedInTarget, priorBBoxes, priorVariances);
 
             // Decode all loc predictions to bboxes
             util::NormalizedBBox clipBounds;
@@ -756,7 +756,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
         CV_Assert(prior_bboxes.size() == prior_variances.size());
         CV_Assert(prior_bboxes.size() == bboxes.size());
         size_t num_bboxes = prior_bboxes.size();
-        CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4);
+        CV_Assert(num_bboxes == 0 || prior_variances[0].size() == 4 || variance_encoded_in_target);
         decode_bboxes.clear(); decode_bboxes.resize(num_bboxes);
         if(variance_encoded_in_target)
         {
@@ -808,12 +808,13 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
     }
 
     // Get prior bounding boxes from prior_data
-    //    prior_data: 1 x 2 x num_priors * 4 x 1 blob.
+    //    prior_data: 1 x 1 x num_priors * 4 x 1 blob or 1 x 2 x num_priors * 4 x 1 blob.
     //    num_priors: number of priors.
     //    prior_bboxes: stores all the prior bboxes in the format of util::NormalizedBBox.
     //    prior_variances: stores all the variances needed by prior bboxes.
     static void GetPriorBBoxes(const float* priorData, const int& numPriors,
-                        bool normalized_bbox, std::vector<util::NormalizedBBox>& priorBBoxes,
+                        bool normalized_bbox, bool variance_encoded_in_target,
+                        std::vector<util::NormalizedBBox>& priorBBoxes,
                         std::vector<std::vector<float> >& priorVariances)
     {
         priorBBoxes.clear(); priorBBoxes.resize(numPriors);
@@ -829,13 +830,16 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
             bbox.set_size(BBoxSize(bbox, normalized_bbox));
         }
 
-        for (int i = 0; i < numPriors; ++i)
+        if (!variance_encoded_in_target)
         {
-            int startIdx = (numPriors + i) * 4;
-            // not needed here: priorVariances[i].clear();
-            for (int j = 0; j < 4; ++j)
+            for (int i = 0; i < numPriors; ++i)
             {
-                priorVariances[i].push_back(priorData[startIdx + j]);
+                int startIdx = (numPriors + i) * 4;
+                // not needed here: priorVariances[i].clear();
+                for (int j = 0; j < 4; ++j)
+                {
+                    priorVariances[i].push_back(priorData[startIdx + j]);
+                }
             }
         }
     }
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index ba1fac1d63be..528d062c385c 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -269,6 +269,12 @@ class ElementWiseLayer : public Func::Layer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return func.tryQuantize(scales, zeropoints, params);
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
@@ -302,6 +308,8 @@ struct BaseFunctor
     bool tryFuse(Ptr<dnn::Layer>&) { return false; }
 
     void getScaleShift(Mat&, Mat&) const {}
+
+    bool tryQuantize(const std::vector<std::vector<float>>&, const std::vector<std::vector<int>>&, LayerParams&) { return false; }
 };
 
 struct ReLUFunctor : public BaseFunctor
@@ -467,6 +475,29 @@ struct ReLUFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        if (slope != 0.f)
+        {
+            float inpScale = scales[0][0], outScale = scales[1][0];
+            int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+            Mat lookUpTable(1, 256, CV_8S);
+            int8_t* table = lookUpTable.ptr<int8_t>();
+            for (int i = -128; i < 128; i++)
+            {
+                float x = inpScale*(i - inpZp);
+                float y = x >= 0.f ? x : slope*x;
+                int quantized = outZp + (int)std::round(y/outScale);
+                table[i+128] = saturate_cast<int8_t>(quantized);
+            }
+            params.blobs.clear();
+            params.blobs.push_back(lookUpTable);
+        }
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -603,6 +634,12 @@ struct ReLU6Functor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 2; }
 };
 
@@ -704,6 +741,26 @@ struct TanHFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = tanh(x);
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -805,6 +862,26 @@ struct SwishFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = x / (1.0f + exp(-x));
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 3; }
 };
 
@@ -919,6 +996,28 @@ struct MishFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float eX = exp(x);
+            float n = (eX + 2) * eX;
+            float y = (x * n) / (n + 2);
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 3; }
 };
 
@@ -1020,6 +1119,26 @@ struct SigmoidFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = 1.f/(1.f + exp(-x));
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 3; }
 };
 
@@ -1121,6 +1240,26 @@ struct ELUFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = x >= 0.f ? x : exp(x) - 1;
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 2; }
 };
 
@@ -1228,6 +1367,26 @@ struct AbsValFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = abs(x);
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 1; }
 };
 
@@ -1330,6 +1489,26 @@ struct BNLLFunctor : public BaseFunctor
     }
 #endif  // HAVE_VULKAN
 
+    bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                     const std::vector<std::vector<int> > &zeropoints, LayerParams& params)
+    {
+        float inpScale = scales[0][0], outScale = scales[1][0];
+        int inpZp = zeropoints[0][0], outZp = zeropoints[1][0];
+
+        Mat lookUpTable(1, 256, CV_8S);
+        int8_t* table = lookUpTable.ptr<int8_t>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - inpZp);
+            float y = x > 0 ? x + log(1. + exp(-x)) : log(1. + exp(x));
+            int quantized = outZp + (int)std::round(y/outScale);
+            table[i+128] = saturate_cast<int8_t>(quantized);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPSPerElement() const { return 5; }
 };
 
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index a337c48d9e69..860560213d92 100644
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -864,6 +864,37 @@ class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
     }
 #endif  // HAVE_DNN_NGRAPH
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        if (op == SUM)
+        {
+            std::vector<float> newCoeffs;
+            float offset = zeropoints[1][0];
+            float out_sc = scales[1][0];
+            for (int i = 0; i < scales[0].size(); i++)
+            {
+                float coeff = coeffs.empty() ? 1.f : coeffs[i];
+                float newcoeff = (scales[0][i] * coeff) / out_sc;
+                newCoeffs.push_back(newcoeff);
+                offset -= (newcoeff * zeropoints[0][i]);
+            }
+            params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
+            params.set("offset", offset);
+            return true;
+        }
+        else if (op == PROD)
+        {
+            std::vector<float> newCoeffs = scales[0];
+            newCoeffs[0] /= scales[1][0];
+            params.set("coeff", DictValue::arrayReal(newCoeffs.data(), newCoeffs.size()));
+            params.set("offset", zeropoints[1][0]);
+            params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
+            return true;
+        }
+        return op == MAX;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp
index 7cf01a14fa33..8ff862fab030 100644
--- a/modules/dnn/src/layers/flatten_layer.cpp
+++ b/modules/dnn/src/layers/flatten_layer.cpp
@@ -227,6 +227,11 @@ virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inp
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
 
     int _startAxis;
     int _endAxis;
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index 2813b037f851..8abda0982852 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -170,7 +170,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
     class FullyConnected : public ParallelLoopBody
     {
     public:
-        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}
+        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false), useRVV(false) {}
 
         static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                         Mat& dstMat, const ActivationLayer* activ, int nstripes)
@@ -193,6 +193,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
             p.useAVX = checkHardwareSupport(CPU_AVX);
             p.useAVX2 = checkHardwareSupport(CPU_AVX2);
             p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;
+            p.useRVV = checkHardwareSupport(CPU_RVV);
 
             parallel_for_(Range(0, nstripes), p, nstripes);
         }
@@ -241,6 +242,11 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
                 if( useAVX )
                     opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                 else
+            #endif
+            #if CV_TRY_RVV
+                if( useRVV )
+                    opt_RVV::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
+                else
             #endif
                 {
                     int i = 0;
@@ -295,6 +301,7 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
         bool useAVX;
         bool useAVX2;
         bool useAVX512;
+        bool useRVV;
     };
 
 #ifdef HAVE_OPENCL
@@ -648,6 +655,45 @@ class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
     }
 #endif // HAVE_WEBNN
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        if (blobs.empty())
+            return false;
+
+        int numOutput = blobs[0].size[0];
+        float inputScale = scales[0][0], outputScale = scales[1][0];
+        int inputZp = zeropoints[0][0];
+
+        Mat weightsQuantized(weightsMat.rows, weightsMat.cols, CV_8S);
+        Mat biasQuantized(1, numOutput, CV_32S);
+        Mat outputMultiplier(1, numOutput, CV_32F);
+
+        double realMin, realMax, weightsScale;
+        for( int i = 0; i < numOutput; i++ )
+        {
+            // Quantize weights
+            cv::minMaxIdx(weightsMat.row(i), &realMin, &realMax);
+            realMin = std::min(realMin, 0.0);
+            realMax = std::max(realMax, 0.0);
+            weightsScale = (realMax == realMin) ? 1.0 : std::max(-realMin, realMax)/127;
+            weightsMat.row(i).convertTo(weightsQuantized.row(i), CV_8S, 1.f/weightsScale);
+
+            // Quantize biases
+            float biasScale = inputScale * weightsScale;
+            biasQuantized.at<int>(i) = (int)std::round(biasMat.at<float>(i)/biasScale) - inputZp*(cv::sum(weightsQuantized.row(i))[0]);
+
+            // Store multiplier
+            outputMultiplier.at<float>(i) = biasScale / outputScale;
+        }
+
+        params.blobs.clear();
+        params.blobs.push_back(weightsQuantized.reshape(1, shape(blobs[0])));
+        params.blobs.push_back(biasQuantized);
+        params.blobs.push_back(outputMultiplier);
+        return true;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/layers_common.simd.hpp b/modules/dnn/src/layers/layers_common.simd.hpp
index 706695a7b20f..762e22e54d2f 100644
--- a/modules/dnn/src/layers/layers_common.simd.hpp
+++ b/modules/dnn/src/layers/layers_common.simd.hpp
@@ -737,5 +737,554 @@ void fastGEMM( const float* aptr, size_t astep, const float* bptr,
 
 #endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
 
+#if !defined(CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY) && CV_RVV
+
+void fastGEMM( const float* aptr, size_t astep, const float* bptr,
+               size_t bstep, float* cptr, size_t cstep,
+               int ma, int na, int nb )
+{
+    int n = 0;
+    size_t vl = 8;
+    size_t mvl0 = 8;
+    size_t mvl1 = 8;
+    for( ; n < nb; n += 16 )
+    {
+        if ( n + 16 > nb) {
+            mvl0 = nb - n;
+            mvl1 = (nb - n -8) > 0 ? (nb - n -8) : 0;
+        }
+
+        for( int m = 0; m < ma; m += 4 )
+        {
+            const float* aptr0 = aptr + astep*m;
+            const float* aptr1 = aptr + astep*std::min(m+1, ma-1);
+            const float* aptr2 = aptr + astep*std::min(m+2, ma-1);
+            const float* aptr3 = aptr + astep*std::min(m+3, ma-1);
+
+            float* cptr0 = cptr + cstep*m;
+            float* cptr1 = cptr + cstep*std::min(m+1, ma-1);
+            float* cptr2 = cptr + cstep*std::min(m+2, ma-1);
+            float* cptr3 = cptr + cstep*std::min(m+3, ma-1);
+
+            vfloat32m2_t d00 = vfmv_v_f_f32m2(0, vl), d01 = vfmv_v_f_f32m2(0, vl);
+            vfloat32m2_t d10 = vfmv_v_f_f32m2(0, vl), d11 = vfmv_v_f_f32m2(0, vl);
+            vfloat32m2_t d20 = vfmv_v_f_f32m2(0, vl), d21 = vfmv_v_f_f32m2(0, vl);
+            vfloat32m2_t d30 = vfmv_v_f_f32m2(0, vl), d31 = vfmv_v_f_f32m2(0, vl);
+
+            for( int k = 0; k < na; k++ )
+            {
+                vfloat32m2_t a0 = vfmv_v_f_f32m2(aptr0[k], vl);
+                vfloat32m2_t a1 = vfmv_v_f_f32m2(aptr1[k], vl);
+                vfloat32m2_t a2 = vfmv_v_f_f32m2(aptr2[k], vl);
+                vfloat32m2_t a3 = vfmv_v_f_f32m2(aptr3[k], vl);
+                vfloat32m2_t b0 = vle32_v_f32m2(bptr + k*bstep + n, mvl0);
+                vfloat32m2_t b1 = vle32_v_f32m2(bptr + k*bstep + n + 8, mvl1);
+                d00 = vfmacc_vv_f32m2(d00, a0, b0, mvl0);
+                d01 = vfmacc_vv_f32m2(d01, a0, b1, mvl1);
+                d10 = vfmacc_vv_f32m2(d10, a1, b0, mvl0);
+                d11 = vfmacc_vv_f32m2(d11, a1, b1, mvl1);
+                d20 = vfmacc_vv_f32m2(d20, a2, b0, mvl0);
+                d21 = vfmacc_vv_f32m2(d21, a2, b1, mvl1);
+                d30 = vfmacc_vv_f32m2(d30, a3, b0, mvl0);
+                d31 = vfmacc_vv_f32m2(d31, a3, b1, mvl1);
+            }
+            vse32_v_f32m2(cptr0 + n, d00, mvl0);
+            vse32_v_f32m2(cptr1 + n, d10, mvl0);
+            vse32_v_f32m2(cptr2 + n, d20, mvl0);
+            vse32_v_f32m2(cptr3 + n, d30, mvl0);
+            vse32_v_f32m2(cptr0 + n + 8, d01, mvl1);
+            vse32_v_f32m2(cptr1 + n + 8, d11, mvl1);
+            vse32_v_f32m2(cptr2 + n + 8, d21, mvl1);
+            vse32_v_f32m2(cptr3 + n + 8, d31, mvl1);
+        }
+    }
+}
+
+void fastGEMM1T( const float* vec, const float* weights,
+                 size_t wstep, const float* bias,
+                 float* dst, int nvecs, int vecsize )
+{
+    int i = 0;
+    size_t vl = 8;
+    for( ; i <= nvecs - 8; i += 8 )
+    {
+        const float* wptr = weights + i*wstep;
+        vfloat32m2_t vs0 = vfmv_v_f_f32m2(0, vl), vs1 = vfmv_v_f_f32m2(0, vl),
+               vs2 = vfmv_v_f_f32m2(0, vl), vs3 = vfmv_v_f_f32m2(0, vl),
+               vs4 = vfmv_v_f_f32m2(0, vl), vs5 = vfmv_v_f_f32m2(0, vl),
+               vs6 = vfmv_v_f_f32m2(0, vl), vs7 = vfmv_v_f_f32m2(0, vl);
+
+        for( int k = 0; k < vecsize; k += 8, wptr += 8 )
+        {
+            vfloat32m2_t v = vle32_v_f32m2(vec + k, vl);
+
+            vs0 = vfmacc_vv_f32m2(vs0, vle32_v_f32m2(wptr, vl), v, vl);
+            vs1 = vfmacc_vv_f32m2(vs1, vle32_v_f32m2(wptr + wstep, vl), v, vl);
+            vs2 = vfmacc_vv_f32m2(vs2, vle32_v_f32m2(wptr + wstep*2, vl), v, vl);
+            vs3 = vfmacc_vv_f32m2(vs3, vle32_v_f32m2(wptr + wstep*3, vl), v, vl);
+            vs4 = vfmacc_vv_f32m2(vs4, vle32_v_f32m2(wptr + wstep*4, vl), v, vl);
+            vs5 = vfmacc_vv_f32m2(vs5, vle32_v_f32m2(wptr + wstep*5, vl), v, vl);
+            vs6 = vfmacc_vv_f32m2(vs6, vle32_v_f32m2(wptr + wstep*6, vl), v, vl);
+            vs7 = vfmacc_vv_f32m2(vs7, vle32_v_f32m2(wptr + wstep*7, vl), v, vl);
+        }
+
+        // Calculate the sum of each vector
+        vfloat32m1_t zero = vfmv_v_f_f32m1(0, vl);
+        vfloat32m1_t temp0 = vfredsum_vs_f32m2_f32m1(temp0, vs0, zero, vl);
+        vfloat32m1_t temp1 = vfredsum_vs_f32m2_f32m1(temp1, vs1, zero, vl);
+        vfloat32m1_t temp2 = vfredsum_vs_f32m2_f32m1(temp2, vs2, zero, vl);
+        vfloat32m1_t temp3 = vfredsum_vs_f32m2_f32m1(temp3, vs3, zero, vl);
+        vfloat32m1_t temp4 = vfredsum_vs_f32m2_f32m1(temp4, vs4, zero, vl);
+        vfloat32m1_t temp5 = vfredsum_vs_f32m2_f32m1(temp5, vs5, zero, vl);
+        vfloat32m1_t temp6 = vfredsum_vs_f32m2_f32m1(temp6, vs6, zero, vl);
+        vfloat32m1_t temp7 = vfredsum_vs_f32m2_f32m1(temp7, vs7, zero, vl);
+        float32_t sum[8];
+        sum[0] = vfmv_f_s_f32m1_f32(temp0);
+        sum[1] = vfmv_f_s_f32m1_f32(temp1);
+        sum[2] = vfmv_f_s_f32m1_f32(temp2);
+        sum[3] = vfmv_f_s_f32m1_f32(temp3);
+        sum[4] = vfmv_f_s_f32m1_f32(temp4);
+        sum[5] = vfmv_f_s_f32m1_f32(temp5);
+        sum[6] = vfmv_f_s_f32m1_f32(temp6);
+        sum[7] = vfmv_f_s_f32m1_f32(temp7);
+        vfloat32m2_t s0 = vfadd_vv_f32m2(vle32_v_f32m2(sum, vl), vle32_v_f32m2(bias + i, vl), vl);
+        vse32_v_f32m2(dst + i, s0, vl);
+    }
+    int mvl = nvecs - i;
+    if (mvl > 0)
+    {
+        const float* wptr = weights + i*wstep;
+        vfloat32m2_t vs0 = vfmv_v_f_f32m2(0, vl), vs1 = vfmv_v_f_f32m2(0, vl),
+               vs2 = vfmv_v_f_f32m2(0, vl), vs3 = vfmv_v_f_f32m2(0, vl),
+               vs4 = vfmv_v_f_f32m2(0, vl), vs5 = vfmv_v_f_f32m2(0, vl),
+               vs6 = vfmv_v_f_f32m2(0, vl), vs7 = vfmv_v_f_f32m2(0, vl);
+        int k = 0;
+        for( ; k <= vecsize - 8; k += 8, wptr += 8 )
+        {
+            vfloat32m2_t v = vle32_v_f32m2(vec + k, vl);
+            vs0 = vfmacc_vv_f32m2(vs0, vle32_v_f32m2(wptr, vl), v, vl);
+            vs1 = vfmacc_vv_f32m2(vs1, vle32_v_f32m2(wptr + wstep*std::min(1, mvl-1), vl), v, vl);
+            vs2 = vfmacc_vv_f32m2(vs2, vle32_v_f32m2(wptr + wstep*std::min(2, mvl-1), vl), v, vl);
+            vs3 = vfmacc_vv_f32m2(vs3, vle32_v_f32m2(wptr + wstep*std::min(3, mvl-1), vl), v, vl);
+            vs4 = vfmacc_vv_f32m2(vs4, vle32_v_f32m2(wptr + wstep*std::min(4, mvl-1), vl), v, vl);
+            vs5 = vfmacc_vv_f32m2(vs5, vle32_v_f32m2(wptr + wstep*std::min(5, mvl-1), vl), v, vl);
+            vs6 = vfmacc_vv_f32m2(vs6, vle32_v_f32m2(wptr + wstep*std::min(6, mvl-1), vl), v, vl);
+        }
+        int kvl = vecsize - k;
+        if (kvl > 0) {
+            vfloat32m2_t v = vle32_v_f32m2(vec + k, kvl);
+            vs0 = vfmacc_vv_f32m2(vs0, vle32_v_f32m2(wptr, kvl), v, kvl);
+            vs1 = vfmacc_vv_f32m2(vs1, vle32_v_f32m2(wptr + wstep*std::min(1, mvl-1), kvl), v, kvl);
+            vs2 = vfmacc_vv_f32m2(vs2, vle32_v_f32m2(wptr + wstep*std::min(2, mvl-1), kvl), v, kvl);
+            vs3 = vfmacc_vv_f32m2(vs3, vle32_v_f32m2(wptr + wstep*std::min(3, mvl-1), kvl), v, kvl);
+            vs4 = vfmacc_vv_f32m2(vs4, vle32_v_f32m2(wptr + wstep*std::min(4, mvl-1), kvl), v, kvl);
+            vs5 = vfmacc_vv_f32m2(vs5, vle32_v_f32m2(wptr + wstep*std::min(5, mvl-1), kvl), v, kvl);
+            vs6 = vfmacc_vv_f32m2(vs6, vle32_v_f32m2(wptr + wstep*std::min(6, mvl-1), kvl), v, kvl);
+        }
+        // Calculate the sum of each vector
+        vfloat32m1_t zero = vfmv_v_f_f32m1(0, vl);
+        vfloat32m1_t temp0 = vfmv_v_f_f32m1(0, 4), temp1 = vfmv_v_f_f32m1(0, 4),
+                temp2 = vfmv_v_f_f32m1(0, 4), temp3 = vfmv_v_f_f32m1(0, 4),
+                temp4 = vfmv_v_f_f32m1(0, 4), temp5 = vfmv_v_f_f32m1(0, 4),
+                temp6 = vfmv_v_f_f32m1(0, 4), temp7 = vfmv_v_f_f32m1(0, 4);
+        temp0 = vfredsum_vs_f32m2_f32m1(temp0, vs0, zero, vl);
+        temp1 = vfredsum_vs_f32m2_f32m1(temp1, vs1, zero, vl);
+        temp2 = vfredsum_vs_f32m2_f32m1(temp2, vs2, zero, vl);
+        temp3 = vfredsum_vs_f32m2_f32m1(temp3, vs3, zero, vl);
+        temp4 = vfredsum_vs_f32m2_f32m1(temp4, vs4, zero, vl);
+        temp5 = vfredsum_vs_f32m2_f32m1(temp5, vs5, zero, vl);
+        temp6 = vfredsum_vs_f32m2_f32m1(temp6, vs6, zero, vl);
+        temp7 = vfredsum_vs_f32m2_f32m1(temp7, vs7, zero, vl);
+
+        float32_t sum[8];
+        sum[0] = vfmv_f_s_f32m1_f32(temp0);
+        sum[1] = vfmv_f_s_f32m1_f32(temp1);
+        sum[2] = vfmv_f_s_f32m1_f32(temp2);
+        sum[3] = vfmv_f_s_f32m1_f32(temp3);
+        sum[4] = vfmv_f_s_f32m1_f32(temp4);
+        sum[5] = vfmv_f_s_f32m1_f32(temp5);
+        sum[6] = vfmv_f_s_f32m1_f32(temp6);
+        sum[7] = vfmv_f_s_f32m1_f32(temp7);
+
+        vfloat32m2_t s0 = vfadd_vv_f32m2(vle32_v_f32m2(sum, mvl), vle32_v_f32m2(bias + i, mvl), mvl);
+        vse32_v_f32m2(dst + i, s0, mvl);
+    }
+}
+
+enum { FASCONV_BASE_VECSZ = 4 }; // TODO: Large base size.
+void fastConv( const float* weights, size_t wstep, const float* bias,
+               const float* rowbuf, float* output, const int* outShape,
+               int blockSize, int vecsize, int vecsize_aligned,
+               const float* relu, bool initOutput )
+{
+    int vl = 4;
+    int outCn = outShape[1];
+    size_t outPlaneSize = outShape[2]*outShape[3];
+    float r0 = 1.f, r1 = 1.f, r2 = 1.f;
+    vfloat32m1_t vr0 = vfmv_v_f_f32m1(1, vl), vr1 = vfmv_v_f_f32m1(1, vl), vr2 = vfmv_v_f_f32m1(1, vl);
+    int maskbuf[FASCONV_BASE_VECSZ] = {0};
+    int rsz = blockSize % FASCONV_BASE_VECSZ;
+    for( int i = 0; i < rsz; i++ )
+        maskbuf[FASCONV_BASE_VECSZ - i - 1] = -1;
+    vint32m1_t vmaskbuf = vle32_v_i32m1(maskbuf ,vl);
+    vbool32_t mask = vmslt_vx_i32m1_b32(vmaskbuf, 0, vl); // mask for tail
+    // now compute dot product of the weights
+    // and im2row-transformed part of the tensor
+    for( int i = 0; i < outCn; i += 3 )
+    {
+        const float* wptr0 = weights + i*wstep;
+        const float* wptr1 = wptr0 + wstep;
+        const float* wptr2 = wptr1 + wstep;
+        float* outptr0 = output + i*outPlaneSize;
+        float* outptr1 = outptr0 + outPlaneSize;
+        float* outptr2 = outptr1 + outPlaneSize;
+        float bias0 = bias[i], bias1 = bias[i+1], bias2 = bias[i+2];
+
+        if( i+2 >= outCn )
+        {
+            wptr2 = wptr1;
+            outptr2 = outptr1;
+            bias2 = bias1;
+            if( i+1 >= outCn )
+            {
+                wptr2 = wptr1 = wptr0;
+                outptr2 = outptr1 = outptr0;
+                bias2 = bias1 = bias0;
+            }
+        }
+
+        if( relu )
+        {
+            r0 = relu[i]; r1 = relu[i+1]; r2 = relu[i+2];
+            if( i+2 >= outCn )
+            {
+                r2 = r1;
+                if( i+1 >= outCn )
+                    r2 = r1 = r0;
+            }
+            vr0 = vfmv_v_f_f32m1(r0, vl);
+            vr1 = vfmv_v_f_f32m1(r1, vl);
+            vr2 = vfmv_v_f_f32m1(r2, vl);
+        }
+
+        int j = 0;
+        for( ; j < blockSize; j += FASCONV_BASE_VECSZ )
+        {
+            bool tail = false;
+            if (j + FASCONV_BASE_VECSZ > blockSize)
+            {
+                if (j == 0) {
+                    vl = blockSize;
+                }
+                else {
+                    j = blockSize - FASCONV_BASE_VECSZ;
+                    tail = true;
+                }
+            }
+            int k = 0;
+            const float* rptr = rowbuf + j*vecsize_aligned;
+            int vlm2 = 8;
+            vfloat32m2_t vs00 = vfmv_v_f_f32m2(0, vlm2), vs01 = vfmv_v_f_f32m2(0, vlm2),
+                   vs02 = vfmv_v_f_f32m2(0, vlm2), vs03 = vfmv_v_f_f32m2(0, vlm2),
+                   vs10 = vfmv_v_f_f32m2(0, vlm2), vs11 = vfmv_v_f_f32m2(0, vlm2),
+                   vs12 = vfmv_v_f_f32m2(0, vlm2), vs13 = vfmv_v_f_f32m2(0, vlm2),
+                   vs20 = vfmv_v_f_f32m2(0, vlm2), vs21 = vfmv_v_f_f32m2(0, vlm2),
+                   vs22 = vfmv_v_f_f32m2(0, vlm2), vs23 = vfmv_v_f_f32m2(0, vlm2);
+
+            for (; k < vecsize; k += 8, rptr += 8 )
+            {
+                if (k+8 >= vecsize) {
+                    vlm2 = vecsize - k;
+                }
+                vfloat32m2_t w0 = vle32_v_f32m2(wptr0 + k, vlm2);
+                vfloat32m2_t w1 = vle32_v_f32m2(wptr1 + k, vlm2);
+                vfloat32m2_t w2 = vle32_v_f32m2(wptr2 + k, vlm2);
+                vfloat32m2_t r0 = vle32_v_f32m2(rptr, vlm2);
+
+                vs00 = vfmacc_vv_f32m2(vs00, w0, r0, vlm2);
+                vs10 = vfmacc_vv_f32m2(vs10, w1, r0, vlm2);
+                vs20 = vfmacc_vv_f32m2(vs20, w2, r0, vlm2);
+
+                r0 = vle32_v_f32m2(rptr + vecsize_aligned, vlm2);
+                vs01 = vfmacc_vv_f32m2(vs01, w0, r0, vlm2);
+                vs11 = vfmacc_vv_f32m2(vs11, w1, r0, vlm2);
+                vs21 = vfmacc_vv_f32m2(vs21, w2, r0, vlm2);
+
+                r0 = vle32_v_f32m2(rptr + vecsize_aligned*2, vlm2);
+                vs02 = vfmacc_vv_f32m2(vs02, w0, r0, vlm2);
+                vs12 = vfmacc_vv_f32m2(vs12, w1, r0, vlm2);
+                vs22 = vfmacc_vv_f32m2(vs22, w2, r0, vlm2);
+
+                r0 = vle32_v_f32m2(rptr + vecsize_aligned*3, vlm2);
+                vs03 = vfmacc_vv_f32m2(vs03, w0, r0, vlm2);
+                vs13 = vfmacc_vv_f32m2(vs13, w1, r0, vlm2);
+                vs23 = vfmacc_vv_f32m2(vs23, w2, r0, vlm2);
+            }
+            vfloat32m1_t s0, s1, s2;
+
+            if( initOutput )
+            {
+                s0 = vfmv_v_f_f32m1(bias0, vl);
+                s1 = vfmv_v_f_f32m1(bias1, vl);
+                s2 = vfmv_v_f_f32m1(bias2, vl);
+            }
+            else
+            {
+                s0 = vle32_v_f32m1(outptr0 + j, vl);
+                s1 = vle32_v_f32m1(outptr1 + j, vl);
+                s2 = vle32_v_f32m1(outptr2 + j, vl);
+            }
+            // compute sum of each vs
+            vfloat32m1_t zero = vfmv_v_f_f32m1(0, vl);
+            vfloat32m1_t temp00 = vfredsum_vs_f32m2_f32m1(temp00, vs00, zero, 8);
+            vfloat32m1_t temp01 = vfredsum_vs_f32m2_f32m1(temp01, vs01, zero, 8);
+            vfloat32m1_t temp02 = vfredsum_vs_f32m2_f32m1(temp02, vs02, zero, 8);
+            vfloat32m1_t temp03 = vfredsum_vs_f32m2_f32m1(temp03, vs03, zero, 8);
+            vfloat32m1_t temp10 = vfredsum_vs_f32m2_f32m1(temp10, vs10, zero, 8);
+            vfloat32m1_t temp11 = vfredsum_vs_f32m2_f32m1(temp11, vs11, zero, 8);
+            vfloat32m1_t temp12 = vfredsum_vs_f32m2_f32m1(temp12, vs12, zero, 8);
+            vfloat32m1_t temp13 = vfredsum_vs_f32m2_f32m1(temp13, vs13, zero, 8);
+            vfloat32m1_t temp20 = vfredsum_vs_f32m2_f32m1(temp20, vs20, zero, 8);
+            vfloat32m1_t temp21 = vfredsum_vs_f32m2_f32m1(temp21, vs21, zero, 8);
+            vfloat32m1_t temp22 = vfredsum_vs_f32m2_f32m1(temp22, vs22, zero, 8);
+            vfloat32m1_t temp23 = vfredsum_vs_f32m2_f32m1(temp23, vs23, zero, 8);
+            float32_t sum0[4], sum1[4], sum2[4];
+            sum0[0] = vfmv_f_s_f32m1_f32(temp00);
+            sum0[1] = vfmv_f_s_f32m1_f32(temp01);
+            sum0[2] = vfmv_f_s_f32m1_f32(temp02);
+            sum0[3] = vfmv_f_s_f32m1_f32(temp03);
+            sum1[0] = vfmv_f_s_f32m1_f32(temp10);
+            sum1[1] = vfmv_f_s_f32m1_f32(temp11);
+            sum1[2] = vfmv_f_s_f32m1_f32(temp12);
+            sum1[3] = vfmv_f_s_f32m1_f32(temp13);
+            sum2[0] = vfmv_f_s_f32m1_f32(temp20);
+            sum2[1] = vfmv_f_s_f32m1_f32(temp21);
+            sum2[2] = vfmv_f_s_f32m1_f32(temp22);
+            sum2[3] = vfmv_f_s_f32m1_f32(temp23);
+
+            s0 = vfadd_vv_f32m1(vle32_v_f32m1(sum0, vl), s0, vl);
+            s1 = vfadd_vv_f32m1(vle32_v_f32m1(sum1, vl), s1, vl);
+            s2 = vfadd_vv_f32m1(vle32_v_f32m1(sum2, vl), s2, vl);
+
+
+            if( relu )
+            {
+                vbool32_t m0 = vmfgt_vf_f32m1_b32(s0, 0, vl);
+                vbool32_t m1 = vmfgt_vf_f32m1_b32(s1, 0, vl);
+                vbool32_t m2 = vmfgt_vf_f32m1_b32(s2, 0, vl);
+                s0 = vmerge_vvm_f32m1(m0, vfmul_vv_f32m1(s0, vr0, vl), s0, vl);
+                s1 = vmerge_vvm_f32m1(m1, vfmul_vv_f32m1(s1, vr1, vl), s1, vl);
+                s2 = vmerge_vvm_f32m1(m2, vfmul_vv_f32m1(s2, vr2, vl), s2, vl);
+            }
+
+            if( tail )
+            {
+                s0 = vmerge_vvm_f32m1(mask, vle32_v_f32m1(outptr0 + j, vl), s0, vl);
+                s1 = vmerge_vvm_f32m1(mask, vle32_v_f32m1(outptr1 + j, vl), s1, vl);
+                s2 = vmerge_vvm_f32m1(mask, vle32_v_f32m1(outptr2 + j, vl), s2, vl);
+            }
+
+            vse32_v_f32m1(outptr0 + j, s0, vl);
+            vse32_v_f32m1(outptr1 + j, s1, vl);
+            vse32_v_f32m1(outptr2 + j, s2, vl);
+        }
+    }
+}
+
+/*
+Example for load_deinterleave:
+    input: ptr[16] = {1,2,3, ... ,14,15,16}
+    output: a = {1, 3, 5, 7, 9, 11, 13, 15}
+    output: b = {2, 4, 6, 8,10, 12, 14, 16}
+*/
+static inline void vfloat32m2_load_deinterleave(const float* ptr, vfloat32m2_t& a, vfloat32m2_t& b)
+{
+    int vl = 8;
+    uint32_t masks[] = {1,1,1,1,0,0,0,0};
+    vuint32m2_t vm = vle32_v_u32m2(masks,vl);
+    vbool16_t mask01 = vmseq_vx_u32m2_b16 (vm, 0, vl);
+    vbool16_t mask10 = vmseq_vx_u32m2_b16 (vm, 1, vl);
+    vfloat32m2_t ta = vle32_v_f32m2(ptr, vl), tb = vle32_v_f32m2(ptr+8, vl);
+    uint idx[] = {0,2,4,6,1,3,5,7};
+    uint idxa[] = {0,0,0,0,0,1,2,3}, idxb[] = {4,5,6,7,0,0,0,0};
+    vuint32m2_t vidxa = vle32_v_u32m2(idxa, 8), vidxb = vle32_v_u32m2(idxb, 8);
+    vuint32m2_t vidx = vle32_v_u32m2(idx, 8);
+    vfloat32m2_t high = vfmv_v_f_f32m2(0, 8), low = vfmv_v_f_f32m2(0, 8);
+    high = vrgather_vv_f32m2(ta, vidx, 8);
+    low = vrgather_vv_f32m2(tb, vidx, 8);
+    a = vrgather_vv_f32m2_m(mask01, high, low, vidxa, 8);
+    b = vrgather_vv_f32m2_m(mask10, low, high, vidxb, 8);
+}
+
+void fastDepthwiseConv( const float* wptr,
+                     int kernel_h, int kernel_w,
+                     int stride_h, int stride_w,
+                     int dilation_h, int dilation_w,
+                     int pad_t, int pad_l,
+                     const float* biasptr, const float* relu,
+                     const float* inptr_,
+                     int height, int width,
+                     float* outptr_,
+                     int out_d, int outH, int outW )
+{
+    int vl = 8;
+    const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
+                w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
+                w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
+    int outW1 = std::min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
+    float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];
+
+    for (int out_i = 0; out_i < outH; out_i++)
+    {
+        int in_i = out_i * stride_h - pad_t, out_j = 0;
+        const float* imgptr0 = inptr_ + in_i*width;
+        const float* imgptr1 = imgptr0 + dilation_h*width;
+        const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
+        float out, w00 = w00_, w01 = w01_, w02 = w02_;
+        float w20 = w20_, w21 = w21_, w22 = w22_;
+        if (in_i < 0)
+        {
+            w00 = w01 = w02 = 0.f;
+            imgptr0 = imgptr1;
+        }
+        else if (in_i + dilation_h*(kernel_h-1) >= height)
+        {
+            w20 = w21 = w22 = 0.f;
+            imgptr2 = imgptr1;
+        }
+        float* outptr = outptr_ + out_i*outW;
+        if (pad_l > 0)
+        {
+            out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
+                  imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
+                  imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[0] = out;
+            out_j = 1;
+        }
+
+        if (stride_w == 1 || (stride_w == 2 && dilation_w == 1))
+        {
+            const int VECSZ = 8;
+            vfloat32m2_t vw00 = vfmv_v_f_f32m2(w00, vl), vw01 = vfmv_v_f_f32m2(w01, vl), vw02 = vfmv_v_f_f32m2(w02, vl),
+                      vw10 = vfmv_v_f_f32m2(w10, vl), vw11 = vfmv_v_f_f32m2(w11, vl), vw12 = vfmv_v_f_f32m2(w12, vl),
+                      vw20 = vfmv_v_f_f32m2(w20, vl), vw21 = vfmv_v_f_f32m2(w21, vl), vw22 = vfmv_v_f_f32m2(w22, vl);
+            vfloat32m2_t vbias = vfmv_v_f_f32m2(bias, vl), vrc = vfmv_v_f_f32m2(relu_coeff, vl);
+
+            if( stride_w == 1 )
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    vfloat32m2_t v00 = vle32_v_f32m2(imgptr0 + in_j, vl),
+                           v01 = vle32_v_f32m2(imgptr0 + in_j + dilation_w, vl),
+                           v02 = vle32_v_f32m2(imgptr0 + in_j + dilation_w*2, vl),
+                           v10 = vle32_v_f32m2(imgptr1 + in_j, vl),
+                           v11 = vle32_v_f32m2(imgptr1 + in_j + dilation_w, vl),
+                           v12 = vle32_v_f32m2(imgptr1 + in_j + dilation_w*2, vl),
+                           v20 = vle32_v_f32m2(imgptr2 + in_j, vl),
+                           v21 = vle32_v_f32m2(imgptr2 + in_j + dilation_w, vl),
+                           v22 = vle32_v_f32m2(imgptr2 + in_j + dilation_w*2, vl);
+
+                    vfloat32m2_t vout0 = vfmacc_vv_f32m2(vbias, v00, vw00, vl);
+                    vfloat32m2_t vout1 = vfmul_vv_f32m2(v01, vw01, vl);
+                    vfloat32m2_t vout2 = vfmul_vv_f32m2(v02, vw02, vl);
+
+                    vout0 = vfmacc_vv_f32m2(vout0, v10, vw10, vl);
+                    vout1 = vfmacc_vv_f32m2(vout1, v11, vw11, vl);
+                    vout2 = vfmacc_vv_f32m2(vout2, v12, vw12, vl);
+
+                    vout0 = vfmacc_vv_f32m2(vout0, v20, vw20, vl);
+                    vout1 = vfmacc_vv_f32m2(vout1, v21, vw21, vl);
+                    vout2 = vfmacc_vv_f32m2(vout2, v22, vw22, vl);
+
+                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
+                    if (relu)
+                    {
+                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
+                        vout0 = vmerge_vvm_f32m2(m, vfmul_vv_f32m2(vout0, vrc, vl), vout0, vl);
+                    }
+                    vse32_v_f32m2(outptr + out_j, vout0, vl);
+                }
+            else
+                for( ; out_j < outW1; out_j += VECSZ )
+                {
+                    if (out_j + VECSZ > outW1 && out_j > pad_l)
+                        out_j = outW1 - VECSZ;
+                    int in_j = out_j * stride_w - pad_l;
+                    vfloat32m2_t v00, v01, v02, v10, v11, v12, v20, v21, v22, unused;
+                    vfloat32m2_load_deinterleave(imgptr0 + in_j, v00, v01);
+                    vfloat32m2_load_deinterleave(imgptr0 + in_j + 2, v02, unused);
+                    vfloat32m2_load_deinterleave(imgptr1 + in_j, v10, v11);
+                    vfloat32m2_load_deinterleave(imgptr1 + in_j + 2, v12, unused);
+                    vfloat32m2_load_deinterleave(imgptr2 + in_j, v20, v21);
+                    vfloat32m2_load_deinterleave(imgptr2 + in_j + 2, v22, unused);
+
+                    vfloat32m2_t vout0 = vfmacc_vv_f32m2(vbias, v00, vw00, vl);
+                    vfloat32m2_t vout1 = vfmul_vv_f32m2(v01, vw01, vl);
+                    vfloat32m2_t vout2 = vfmul_vv_f32m2(v02, vw02, vl);
+
+                    vout0 = vfmacc_vv_f32m2(vout0, v10, vw10, vl);
+                    vout1 = vfmacc_vv_f32m2(vout1, v11, vw11, vl);
+                    vout2 = vfmacc_vv_f32m2(vout2, v12, vw12, vl);
+
+                    vout0 = vfmacc_vv_f32m2(vout0, v20, vw20, vl);
+                    vout1 = vfmacc_vv_f32m2(vout1, v21, vw21, vl);
+                    vout2 = vfmacc_vv_f32m2(vout2, v22, vw22, vl);
+
+                    vout0 = vfadd_vv_f32m2(vfadd_vv_f32m2(vout0, vout1, vl), vout2, vl);
+                    if (relu)
+                    {
+                        vbool16_t m = vmfgt_vf_f32m2_b16(vout0, 0, vl);
+                        vout0 = vmerge_vvm_f32m2(m, vfmul_vv_f32m2(vout0, vrc, vl), vout0, vl);
+                    }
+                    vse32_v_f32m2(outptr + out_j, vout0, vl);
+                }
+        }
+
+        for (; out_j < outW1; out_j++)
+        {
+            int in_j = out_j * stride_w - pad_l;
+            out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
+                  imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
+                  imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+
+        for (; out_j < outW; out_j++ )
+        {
+            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
+            float s0 = 1.f, s1 = 1.f, s2 = 1.f;
+            if (in_j0 >= width)
+            {
+                in_j0 = 0;
+                s0 = 0.f;
+            }
+            if (in_j1 >= width)
+            {
+                in_j1 = 0;
+                s1 = 0.f;
+            }
+            if (in_j2 >= width)
+            {
+                in_j2 = 0;
+                s2 = 0.f;
+            }
+            out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
+                  imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
+                  imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
+            if (relu)
+                out = out > 0.f ? out : out*relu_coeff;
+            outptr[out_j] = out;
+        }
+    }
+}
+
+#endif // CV_RVV
+
 CV_CPU_OPTIMIZATION_NAMESPACE_END
 }} // namespace
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index 24559543e19b..236f2e43f11e 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -338,7 +338,7 @@ class NormalizeBBoxLayerImpl CV_FINAL : public NormalizeBBoxLayer
             std::iota(axes_data.begin(), axes_data.end(), 1);
         }
         auto axes = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{axes_data.size()}, axes_data);
-        auto norm = std::make_shared<ngraph::op::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
+        auto norm = std::make_shared<ngraph::op::v0::NormalizeL2>(ieInpNode, axes, epsilon, ngraph::op::EpsMode::ADD);
 
         CV_Assert(blobs.empty() || numChannels == blobs[0].total());
         std::vector<size_t> shape(ieInpNode->get_shape().size(), 1);
diff --git a/modules/dnn/src/layers/not_implemented_layer.cpp b/modules/dnn/src/layers/not_implemented_layer.cpp
new file mode 100644
index 000000000000..c4b134390222
--- /dev/null
+++ b/modules/dnn/src/layers/not_implemented_layer.cpp
@@ -0,0 +1,194 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "../precomp.hpp"
+#include "../dnn_common.hpp"
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+namespace detail {
+
+class NotImplementedImpl CV_FINAL : public NotImplemented
+{
+public:
+    NotImplementedImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        CV_Assert(params.has("type"));
+        std::stringstream ss;
+        ss << "Node for layer '" << params.name << "' of type '" << params.get("type") << "' wasn't initialized.";
+        msg = ss.str();
+    }
+
+    CV_DEPRECATED_EXTERNAL
+    virtual void finalize(const std::vector<Mat*> &input, std::vector<Mat> &output) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    CV_DEPRECATED_EXTERNAL
+    virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    void forward_fallback(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals)
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    CV_DEPRECATED_EXTERNAL
+    void finalize(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs)
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    CV_DEPRECATED std::vector<Mat> finalize(const std::vector<Mat> &inputs)
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    CV_DEPRECATED void run(const std::vector<Mat> &inputs,
+                           CV_OUT std::vector<Mat> &outputs,
+                           CV_IN_OUT std::vector<Mat> &internals)
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual int inputNameToIndex(String inputName) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual int outputNameToIndex(const String& outputName) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual bool supportBackend(int backendId) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
+                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> initCUDA(
+            void *context,
+            const std::vector<Ptr<BackendWrapper>>& inputs,
+            const std::vector<Ptr<BackendWrapper>>& outputs
+    ) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
+                                      const std::vector<Mat*> &inputs,
+                                      const std::vector<Mat> &outputs,
+                                      int targetId) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual void unsetAttached() CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape> &outputs,
+                                 std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
+                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+    virtual bool updateMemoryShapes(const std::vector<MatShape> &inputs) CV_OVERRIDE
+    {
+        CV_Error(Error::StsNotImplemented, msg);
+    }
+
+private:
+    std::string msg;
+};
+
+Ptr<Layer> NotImplemented::create(const LayerParams& params)
+{
+    return makePtr<NotImplementedImpl>(params);
+}
+
+Ptr<Layer> notImplementedRegisterer(LayerParams &params)
+{
+    return detail::NotImplemented::create(params);
+}
+
+void NotImplemented::Register()
+{
+    LayerFactory::registerLayer("NotImplemented", detail::notImplementedRegisterer);
+}
+
+void NotImplemented::unRegister()
+{
+    LayerFactory::unregisterLayer("NotImplemented");
+}
+
+} // namespace detail
+
+CV__DNN_INLINE_NS_END
+}}  // namespace cv::dnn
diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp
index d18256879580..c1979ce701ac 100644
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -134,6 +134,8 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer
                 cv::convertFp16(paddingValue_fp32, paddingValue_fp16);
                 outputs[0].setTo(paddingValue_fp16[0]);
             }
+            else if (inputs_arr.depth() == CV_8S)
+                outputs[0].setTo(saturate_cast<int8_t>(paddingValue));
             else
                 outputs[0].setTo(paddingValue);
             inputs[0].copyTo(outputs[0](dstRanges));
@@ -264,6 +266,16 @@ class PaddingLayerImpl CV_FINAL : public PaddingLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        float outputScale = scales[1][0];
+        int outputZp = zeropoints[1][0];
+        float padValue = outputZp + std::round(params.get<float>("value", 0)/outputScale);
+        params.set("value", padValue);
+        return true;
+    }
+
 private:
     std::vector<std::pair<int, int> > paddings;  // Pairs pad before, pad after.
     std::vector<Range> dstRanges;
diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp
index f950e2cff3c4..9e66eb6a648f 100644
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -196,6 +196,7 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
 #endif
     }
 
+    template <class T>
     class PermuteInvoker : public ParallelLoopBody
     {
     public:
@@ -231,7 +232,7 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
             size_t stripeStart = r.start*stripeSize;
             size_t stripeEnd = std::min(r.end*stripeSize, orows);
 
-            const size_t esz = sizeof(float);
+            const size_t esz = sizeof(T);
             size_t ostep0 = out->step[0]/esz, ostep1 = out->step[1]/esz, ostep2 = out->step[2]/esz;
             const size_t* ord = &order->at(0);
             size_t istep0 = inp->step[ord[0]]/esz, istep1 = inp->step[ord[1]]/esz,
@@ -243,13 +244,13 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
             int i1 = (int)(val % n1);
             int i0 = (int)(val / n1);
 
-            const float* inptr_orig = inp->ptr<float>();
-            float* outptr_orig = out->ptr<float>();
+            const T* inptr_orig = inp->ptr<T>();
+            T* outptr_orig = out->ptr<T>();
 
             for( size_t ofs = stripeStart; ofs < stripeEnd; ofs++ )
             {
-                const float* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
-                float* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;
+                const T* inptr = inptr_orig + i0*istep0 + i1*istep1 + i2*istep2;
+                T* outptr = outptr_orig + i0*ostep0 + i1*ostep1 + i2*ostep2;
 
                 for( int i3 = 0; i3 < n3; i3++ )
                     outptr[i3] = inptr[i3*istep3];
@@ -323,7 +324,8 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
-        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
+        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
+                   inputs_arr.depth() != CV_8S,
                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 
         if (inputs_arr.depth() == CV_16S)
@@ -367,24 +369,48 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
                 if( numAxes == 4 )
                 {
                     int nstripes = getNumThreads();
-                    PermuteInvoker::run(inp, out, _order, nstripes);
+                    if (inp.type() == CV_8S)
+                        PermuteInvoker<int8_t>::run(inp, out, _order, nstripes);
+                    else
+                        PermuteInvoker<float>::run(inp, out, _order, nstripes);
                 }
                 else
                 {
-                    const float *srcData = inp.ptr<float>();
-                    float *dstData = out.ptr<float>();
+                    if (inp.type() == CV_8S)
+                    {
+                        const int8_t *srcData = inp.ptr<int8_t>();
+                        int8_t *dstData = out.ptr<int8_t>();
 
-                    for (i = 0; i < count; ++i)
+                        for (i = 0; i < count; ++i)
+                        {
+                            size_t oldPosition = 0;
+                            size_t newPosition = i;
+
+                            for (j = 0; j < numAxes; ++j)
+                            {
+                                oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
+                                newPosition %= newStride[j];
+                            }
+                            dstData[i] = srcData[oldPosition];
+                        }
+                    }
+                    else
                     {
-                        size_t oldPosition = 0;
-                        size_t newPosition = i;
+                        const float *srcData = inp.ptr<float>();
+                        float *dstData = out.ptr<float>();
 
-                        for (j = 0; j < numAxes; ++j)
+                        for (i = 0; i < count; ++i)
                         {
-                            oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
-                            newPosition %= newStride[j];
+                            size_t oldPosition = 0;
+                            size_t newPosition = i;
+
+                            for (j = 0; j < numAxes; ++j)
+                            {
+                                oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
+                                newPosition %= newStride[j];
+                            }
+                            dstData[i] = srcData[oldPosition];
                         }
-                        dstData[i] = srcData[oldPosition];
                     }
                 }
             }
@@ -452,6 +478,11 @@ class PermuteLayerImpl CV_FINAL : public PermuteLayer
     }
 #endif // HAVE_VULKAN
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
 
     size_t _count;
     std::vector<size_t> _order;
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 3ac3fbce67c2..0b9b94fa5772 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -1413,6 +1413,23 @@ class PoolingLayerImpl CV_FINAL : public PoolingLayer
         return true;
     }
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        if (type == MAX && !computeMaxIdx)
+        {
+            return true;
+        }
+        else if (type == AVE || type == SUM)
+        {
+            float multiplier = scales[0][0] / scales[1][0];
+            params.set("multiplier", multiplier);
+            params.set("input_zeropoint", zeropoints[0][0]);
+            return true;
+        }
+        return false;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp
index bdc46643fca2..6bfa48adbf43 100644
--- a/modules/dnn/src/layers/recurrent_layers.cpp
+++ b/modules/dnn/src/layers/recurrent_layers.cpp
@@ -80,12 +80,31 @@ static void sigmoid(const Mat &src, Mat &dst)
     cv::pow(1 + dst, -1, dst);
 }
 
+typedef void (*ActivationFunction)(const Mat &src, Mat &dst);
+static ActivationFunction get_activation_function(const String& activation) {
+    // most used activations for PyTorch and TF : Tanh, Sigmoid
+    // if you need to support more optional activations use std::map instead
+    if (activation == "Tanh")
+    {
+        return tanh;
+    }
+    else if (activation == "Sigmoid")
+    {
+        return sigmoid;
+    }
+    else
+    {
+        CV_Error(Error::StsNotImplemented,
+                 cv::format("Activation function [%s] for layer LSTM  is not supported", activation.c_str()));
+    }
+}
+
 class LSTMLayerImpl CV_FINAL : public LSTMLayer
 {
     int numTimeStamps, numSamples;
     bool allocated;
 
-    MatShape outTailShape;                 //shape of single output sample
+    MatShape outTailShape;  //shape of single output sample
     MatShape outTsShape;    //shape of N output samples
 
     bool useTimestampDim;
@@ -95,6 +114,10 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
     bool reverse;   // If true, go in negative direction along the time axis
     bool bidirectional;  // If true, produces both forward and reversed directions along time axis
 
+    ActivationFunction f_activation;
+    ActivationFunction g_activation;
+    ActivationFunction h_activation;
+
 public:
 
     LSTMLayerImpl(const LayerParams& params)
@@ -112,19 +135,24 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
             const Mat& Wh = blobs[0];
             const Mat& Wx = blobs[1];
             const Mat& bias = blobs[2];
+            const Mat& hInternal = blobs[3];
+            const Mat& cInternal = blobs[4];
             CV_CheckEQ(Wh.dims, 2, "");
             CV_CheckEQ(Wx.dims, 2, "");
             CV_CheckEQ(Wh.rows, Wx.rows, "");
             CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional))*4*Wh.cols, "");
             CV_CheckEQ(Wh.rows, (int)bias.total(), "");
+            CV_CheckEQ(hInternal.cols, Wh.cols, "");
+            CV_CheckEQ(hInternal.cols, cInternal.cols, "");
+            CV_CheckEQ(hInternal.rows, cInternal.rows, "");
             CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type());
 
             // Peephole weights.
-            if (blobs.size() > 3)
+            if (blobs.size() > 5)
             {
-                CV_Assert(blobs.size() == 6);
+                CV_Assert(blobs.size() == 8);
                 const int N = Wh.cols;
-                for (int i = 3; i < 6; ++i)
+                for (int i = 5; i < 8; ++i)
                 {
                     CV_Assert(blobs[i].rows == N && blobs[i].cols == N);
                     CV_Assert(blobs[i].type() == bias.type());
@@ -140,6 +168,20 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
         reverse = params.get<bool>("reverse", false);
         CV_Assert(!reverse || !bidirectional);
 
+        // read activations
+        DictValue activations = params.get<DictValue>("activations", "");
+        if (activations.size() == 1) // if activations wasn't specified use default
+        {
+            f_activation = sigmoid;
+            g_activation = tanh;
+            h_activation = tanh;
+        } else {
+            CV_Assert(activations.size() == 3);
+            f_activation = get_activation_function(activations.getStringValue(0));
+            g_activation = get_activation_function(activations.getStringValue(1));
+            h_activation = get_activation_function(activations.getStringValue(2));
+        }
+
         allocated = false;
         outTailShape.clear();
     }
@@ -181,7 +223,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
                          std::vector<MatShape> &outputs,
                          std::vector<MatShape> &internals) const CV_OVERRIDE
     {
-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(inputs.size() == 1);
         const MatShape& inp0 = inputs[0];
 
@@ -228,7 +270,7 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
         std::vector<Mat> input;
         inputs_arr.getMatVector(input);
 
-        CV_Assert((!usePeephole && blobs.size() == 3) || (usePeephole && blobs.size() == 6));
+        CV_Assert((!usePeephole && blobs.size() == 5) || (usePeephole && blobs.size() == 8));
         CV_Assert(input.size() == 1);
         const Mat& inp0 = input[0];
 
@@ -284,13 +326,14 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
             const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
             const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
             const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
+            const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
+            const Mat &c_0 = blobs[4].rowRange(i * blobs[4].rows / numDirs, (i + 1) * blobs[4].rows / numDirs);
 
             int numOut = Wh.size[1];
-
             Mat hInternal = internals[0], cInternal = internals[1],
                     dummyOnes = internals[2], gates = internals[3];
-            hInternal.setTo(0.);
-            cInternal.setTo(0.);
+            h_0.copyTo(hInternal);
+            c_0.copyTo(cInternal);
             dummyOnes.setTo(1.);
 
             int numSamplesTotal = numTimeStamps*numSamples;
@@ -331,17 +374,17 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
                 if (usePeephole)
                 {
                     Mat gatesIF = gates.colRange(0, 2*numOut);
-                    gemm(cInternal, blobs[3], 1, gateI, 1, gateI);
-                    gemm(cInternal, blobs[4], 1, gateF, 1, gateF);
-                    sigmoid(gatesIF, gatesIF);
+                    gemm(cInternal, blobs[5], 1, gateI, 1, gateI);
+                    gemm(cInternal, blobs[6], 1, gateF, 1, gateF);
+                    f_activation(gatesIF, gatesIF);
                 }
                 else
                 {
                     Mat gatesIFO = gates.colRange(0, 3*numOut);
-                    sigmoid(gatesIFO, gatesIFO);
+                    f_activation(gatesIFO, gatesIFO);
                 }
 
-                tanh(gateG, gateG);
+                g_activation(gateG, gateG);
 
                 //compute c_t
                 multiply(gateF, cInternal, gateF);  // f_t (*) c_{t-1}
@@ -355,12 +398,12 @@ class LSTMLayerImpl CV_FINAL : public LSTMLayer
                 }
                 if (usePeephole)
                 {
-                    gemm(cInternal, blobs[5], 1, gateO, 1, gateO);
-                    sigmoid(gateO, gateO);
+                    gemm(cInternal, blobs[7], 1, gateO, 1, gateO);
+                    f_activation(gateO, gateO);
                 }
 
                 //compute h_t
-                tanh(cInternal, hInternal);
+                h_activation(cInternal, hInternal);
                 multiply(gateO, hInternal, hInternal);
 
                 //save results in output blobs
@@ -557,5 +600,214 @@ CV_EXPORTS_W Ptr<RNNLayer> RNNLayer::create(const LayerParams& params)
     return Ptr<RNNLayer>(new RNNLayerImpl(params));
 }
 
+class GRULayerImpl CV_FINAL : public GRULayer
+{
+    int numTimeStamps, numSamples;
+    bool allocated;
+
+    MatShape outTailShape;  //shape of single output sample
+    MatShape outTsShape;    //shape of N output samples
+    bool bidirectional;     // If true, produces both forward and reversed directions along time axis
+
+public:
+
+    GRULayerImpl(const LayerParams& params) : numTimeStamps(0), numSamples(0)
+    {
+        setParamsFrom(params);
+
+        bidirectional = params.get<bool>("bidirectional", false);
+        if (!blobs.empty())
+        {
+            CV_Assert(blobs.size() >= 3);
+
+            blobs[2] = blobs[2].reshape(1, 1);
+
+            const Mat& Wh = blobs[0];
+            const Mat& Wx = blobs[1];
+            const Mat& bias = blobs[2];
+            const Mat& hInternal = blobs[3];
+            CV_CheckEQ(Wh.dims, 2, "");
+            CV_CheckEQ(Wx.dims, 2, "");
+            CV_CheckEQ(Wh.rows, Wx.rows, "");
+            CV_CheckEQ(Wh.rows, (1 + static_cast<int>(bidirectional)) * 3 * Wh.cols, "");
+            CV_CheckEQ(Wh.rows * 2, (int)bias.total(), "");
+            CV_CheckEQ(hInternal.cols, Wh.cols, "");
+            CV_CheckTypeEQ(Wh.type(), Wx.type(), "");
+            CV_CheckTypeEQ(Wx.type(), bias.type(), "");
+        }
+
+        allocated = false;
+        outTailShape.clear();
+    }
+
+    bool getMemoryShapes(const std::vector<MatShape> &inputs,
+                         const int requiredOutputs,
+                         std::vector<MatShape> &outputs,
+                         std::vector<MatShape> &internals) const CV_OVERRIDE
+    {
+        CV_Assert(inputs.size() == 1);
+        const MatShape& inp0 = inputs[0];
+
+        const Mat &Wh = blobs[0], &Wx = blobs[1];
+        int _numOut = Wh.size[1];
+        int _numInp = Wx.size[1];
+        MatShape outTailShape_(outTailShape), outResShape;
+
+        if (!outTailShape_.empty())
+            CV_Assert(total(outTailShape_) == _numOut);
+        else
+            outTailShape_.assign(1, _numOut);
+
+        int _numSamples;
+        CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp);
+        _numSamples = inp0[1];
+        outResShape.push_back(inp0[0]);
+
+        outResShape.push_back(_numSamples);
+        outResShape.insert(outResShape.end(), outTailShape_.begin(), outTailShape_.end());
+        outResShape.back() *= (1 + static_cast<int>(bidirectional));
+
+        outputs.assign(1, outResShape);
+
+        internals.assign(1, shape(_numSamples, _numOut));     // hInternal
+        internals.push_back(shape(_numSamples, 1));           // dummyOnes
+        internals.push_back(shape(_numSamples, 2 * _numOut)); // gates
+        internals.push_back(shape(_numSamples, 2 * _numOut)); // gates_b
+        internals.push_back(shape(_numSamples, 1 * _numOut)); // h_linear
+        internals.push_back(shape(_numSamples, _numOut));     // ones
+
+        return false;
+    }
+
+    void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
+    {
+        std::vector<Mat> input;
+        inputs_arr.getMatVector(input);
+
+        CV_Assert(input.size() == 1);
+        const Mat& inp0 = input[0];
+
+        Mat &Wh = blobs[0], &Wx = blobs[1];
+        int numOut = Wh.size[1];
+        int numInp = Wx.size[1];
+
+        if (!outTailShape.empty())
+            CV_Assert(total(outTailShape) == numOut);
+        else
+            outTailShape.assign(1, numOut);
+
+        CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp);
+        numTimeStamps = inp0.size[0];
+        numSamples = inp0.size[1];
+
+        outTsShape.clear();
+        outTsShape.push_back(numSamples);
+        outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end());
+        outTsShape.back() *= (1 + static_cast<int>(bidirectional));
+
+        allocated = true;
+    }
+
+    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+    {
+        CV_TRACE_FUNCTION();
+        CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+        if (inputs_arr.depth() == CV_16S)
+        {
+            forward_fallback(inputs_arr, outputs_arr, internals_arr);
+            return;
+        }
+
+        std::vector<Mat> input, output, internals;
+        inputs_arr.getMatVector(input);
+        outputs_arr.getMatVector(output);
+        internals_arr.getMatVector(internals);
+
+        const int numDirs = 1 + static_cast<int>(bidirectional);
+        for (int i = 0; i < numDirs; ++i)
+        {
+            const Mat &Wh = blobs[0].rowRange(i * blobs[0].rows / numDirs, (i + 1) * blobs[0].rows / numDirs);
+            const Mat &Wx = blobs[1].rowRange(i * blobs[1].rows / numDirs, (i + 1) * blobs[1].rows / numDirs);
+            const Mat &bias = blobs[2].colRange(i * blobs[2].cols / numDirs, (i + 1) * blobs[2].cols / numDirs);
+            const Mat &h_0 = blobs[3].rowRange(i * blobs[3].rows / numDirs, (i + 1) * blobs[3].rows / numDirs);
+
+            const Mat &bx = bias.colRange(0, bias.cols / 2);
+            const Mat &bh = bias.colRange(bias.cols / 2, bias.cols);
+
+            Mat hInternal = internals[0], dummyOnes = internals[1], gates = internals[2],
+                b_rz = internals[3], n_t = internals[4], ones = internals[5];
+            h_0.copyTo(hInternal);
+            dummyOnes.setTo(1.);
+            ones.setTo(1.);
+
+            int numOut = Wh.size[1];
+            const Mat& wx_rz = Wx.rowRange(0, 2 * numOut);
+            const Mat& wh_rz = Wh.rowRange(0, 2 * numOut);
+            b_rz = bx.colRange(0, 2 * numOut) + bh.colRange(0, 2 * numOut);
+            const Mat& wx_n = Wx.rowRange(2 * numOut, 3 * numOut);
+            const Mat& wh_n = Wh.rowRange(2 * numOut, 3 * numOut);
+            const Mat& b_in = bx.colRange(2 * numOut, 3 * numOut);
+            const Mat& b_hn = bh.colRange(2 * numOut, 3 * numOut);
+
+            int numSamplesTotal = numTimeStamps * numSamples;
+            Mat xTs = input[0].reshape(1, numSamplesTotal);
+
+            Mat hOutTs = output[0].reshape(1, numSamplesTotal);
+            hOutTs = hOutTs.colRange(i * hOutTs.cols / numDirs, (i + 1) * hOutTs.cols / numDirs);
+            Mat cOutTs = Mat();
+
+            int tsStart, tsEnd, tsInc;
+            if (i == 1) {
+                tsStart = numTimeStamps - 1;
+                tsEnd = -1;
+                tsInc = -1;
+            }
+            else {
+                tsStart = 0;
+                tsEnd = numTimeStamps;
+                tsInc = 1;
+            }
+            for (int ts = tsStart; ts != tsEnd; ts += tsInc)
+            {
+                Range curRowRange(ts * numSamples, (ts + 1) * numSamples);
+                Mat xCurr = xTs.rowRange(curRowRange);
+
+                // calculate r_t = sigmoid(x * Wx_r + h_(t-1) * Wh_r + b_r)
+                // calculate z_t = sigmoid(x * Wx_z + h_(t-1) * Wh_z + b_z)
+                gemm(xCurr, wx_rz, 1, gates, 0, gates, GEMM_2_T);      // x * Wx_rz
+                gemm(hInternal, wh_rz, 1, gates, 1, gates, GEMM_2_T);  // + h_(t-1) * Wh_rz
+                gemm(dummyOnes, b_rz, 1, gates, 1, gates);             // + b_rz
+                sigmoid(gates, gates);                                 // sigmoid()
+
+                Mat z = gates.colRange(0, gates.cols / 2);
+                Mat r = gates.colRange(gates.cols / 2, gates.cols);
+
+                // calculate n_t = tanh(r (*) (h_(t-1) * Wh_n + b_hn) + x * Wx_n + b_in)
+                gemm(hInternal, wh_n, 1, n_t, 0, n_t, GEMM_2_T);       // h_(t-1) * Wh_n
+                gemm(dummyOnes, b_hn, 1, n_t, 1, n_t);                 // + b_hn
+                multiply(r, n_t, n_t);                                 // r (*) (h_(t-1) * Wh_n + b_hn)
+
+                gemm(xCurr, wx_n, 1, n_t, 1, n_t, GEMM_2_T);          // + x * Wx_n
+                gemm(dummyOnes, b_in, 1, n_t, 1, n_t);                // + b_in
+                tanh(n_t, n_t);                                       // tanh()
+
+                //compute next h_t = z (*) h_(t-1) + (1 - z) (*) n_t
+                multiply(z, hInternal, hInternal);                    // z (*) h_{t-1}
+                subtract(ones, z, z);                                 // 1 - z
+                multiply(z, n_t, z);                                  // (1 - z) * n
+                add(z, hInternal, hInternal);                         // z (*) h_(t-1) + (1 - z) (*) n_t
+
+                //save results in output blobs
+                hInternal.copyTo(hOutTs.rowRange(curRowRange));
+            }
+        }
+    }
+};
+
+Ptr<GRULayer> GRULayer::create(const LayerParams &params) {
+    return Ptr<GRULayer>(new GRULayerImpl(params));
+}
+
 }
 }
diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp
index da1c61adac0e..797df4819d9e 100644
--- a/modules/dnn/src/layers/reorg_layer.cpp
+++ b/modules/dnn/src/layers/reorg_layer.cpp
@@ -231,6 +231,11 @@ class ReorgLayerImpl CV_FINAL : public ReorgLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
 
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp
index 7c1829d4298e..0ba3abf04758 100644
--- a/modules/dnn/src/layers/reshape_layer.cpp
+++ b/modules/dnn/src/layers/reshape_layer.cpp
@@ -356,6 +356,11 @@ class ReshapeLayerImpl CV_FINAL : public ReshapeLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
 
 private:
     std::vector<MatShape> outShapes;
diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp
index a5c268214e86..001db24a2df8 100644
--- a/modules/dnn/src/layers/scale_layer.cpp
+++ b/modules/dnn/src/layers/scale_layer.cpp
@@ -344,6 +344,14 @@ class ScaleLayerImpl CV_FINAL : public ScaleLayer
         shift = (hasBias && !blobs.empty()) ? blobs.back() : Mat();
     }
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
+        params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
+        return true;
+    }
+
     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/shuffle_channel_layer.cpp b/modules/dnn/src/layers/shuffle_channel_layer.cpp
index 6db74d1abda7..2a698d270fa8 100644
--- a/modules/dnn/src/layers/shuffle_channel_layer.cpp
+++ b/modules/dnn/src/layers/shuffle_channel_layer.cpp
@@ -147,6 +147,12 @@ class ShuffleChannelLayerImpl CV_FINAL : public ShuffleChannelLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        return true;
+    }
+
 private:
     Ptr<PermuteLayer> permute;
     std::vector<int> permuteInpShape, permuteOutShape;
diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp
index 54e234038710..9efd95cf48df 100644
--- a/modules/dnn/src/layers/slice_layer.cpp
+++ b/modules/dnn/src/layers/slice_layer.cpp
@@ -531,7 +531,12 @@ class SliceLayerImpl : public SliceLayer
             {
                 std::vector<int> inpIdx(dimsNum, 0);
                 std::vector<int> outIdx(dimsNum, 0);
-                getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
+                if (inpMat.type() == CV_16S)
+                    getSliceRecursive<int16_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
+                else if (inpMat.type() == CV_8S)
+                    getSliceRecursive<int8_t>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
+                else
+                    getSliceRecursive<float>(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx);
             }
         }
     }
@@ -647,8 +652,20 @@ class SliceLayerImpl : public SliceLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        const int numOutputs = scales[1].size();
+        for (int i = 0; i < numOutputs; i++)
+        {
+            if (scales[1][i] != scales[0][0])
+             return false;
+        }
+        return true;
+    }
 
 private:
+    template <typename T>
     void getSliceRecursive(const Mat &inpMat, std::vector<int> &inpIdx,
                            const std::vector<Range> &sliceRanges,
                            const std::vector<int> &sliceSteps, int dim, int dimsNum,
@@ -658,8 +675,6 @@ class SliceLayerImpl : public SliceLayer
         int end = sliceRanges[dim].end;
         int step = !sliceSteps.empty() ? sliceSteps[dim] : 1;
 
-        const bool is32F = inpMat.depth() == CV_32F;
-
         // TODO optimization is required (for 2D tail case at least)
         for (int k = begin, j = 0; k < end; k += step, j++)
         {
@@ -667,14 +682,9 @@ class SliceLayerImpl : public SliceLayer
             outIdx[dim] = j;
 
             if (dim + 1 < dimsNum)
-                getSliceRecursive(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
+                getSliceRecursive<T>(inpMat, inpIdx, sliceRanges, sliceSteps, dim + 1, dimsNum, outputs, outIdx);
             else
-            {
-                if (is32F)
-                    outputs.at<float>(outIdx.data()) = inpMat.at<float>(inpIdx.data());
-                else
-                    outputs.at<short>(outIdx.data()) = inpMat.at<short>(inpIdx.data());  // 16F emulation
-            }
+                outputs.at<T>(outIdx.data()) = inpMat.at<T>(inpIdx.data());
         }
     }
 
diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp
index 0f67eb39b674..fdab6d3114f8 100644
--- a/modules/dnn/src/layers/softmax_layer.cpp
+++ b/modules/dnn/src/layers/softmax_layer.cpp
@@ -398,6 +398,22 @@ class SoftMaxLayerImpl CV_FINAL : public SoftmaxLayer
 
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        float inpScale = scales[0][0];
+        Mat lookUpTable(1, 256, CV_32F);
+        float* table = lookUpTable.ptr<float>();
+        for (int i = -128; i < 128; i++)
+        {
+            float x = inpScale*(i - 127); // ensures exp(x) is always between (0, 1)
+            table[i+128] = std::exp(x);
+        }
+        params.blobs.clear();
+        params.blobs.push_back(lookUpTable);
+        return true;
+    }
+
     int64 getFLOPS(const std::vector<MatShape> &inputs,
                   const std::vector<MatShape> &outputs) const CV_OVERRIDE
     {
diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp
index b025d5ff1e49..2a4417615264 100644
--- a/modules/dnn/src/layers/split_layer.cpp
+++ b/modules/dnn/src/layers/split_layer.cpp
@@ -117,6 +117,17 @@ class SplitLayerImpl CV_FINAL : public SplitLayer
     }
 #endif
 
+    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
+                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
+    {
+        const int numOutputs = scales[1].size();
+        for (int i = 0; i < numOutputs; i++)
+        {
+            if (scales[1][i] != scales[0][0])
+             return false;
+        }
+        return true;
+    }
 };
 
 Ptr<SplitLayer> SplitLayer::create(const LayerParams& params)
diff --git a/modules/dnn/src/math_utils.hpp b/modules/dnn/src/math_utils.hpp
new file mode 100644
index 000000000000..19ee474c7365
--- /dev/null
+++ b/modules/dnn/src/math_utils.hpp
@@ -0,0 +1,83 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Code is borrowed from https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-math.h
+
+// base/kaldi-math.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;  Yanmin Qian;
+//                      Jan Silovsky;  Saarland University
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __OPENCV_DNN_MATH_UTILS_HPP__
+#define __OPENCV_DNN_MATH_UTILS_HPP__
+
+#ifdef OS_QNX
+#include <math.h>
+#else
+#include <cmath>
+#endif
+
+#include <limits>
+
+#ifndef FLT_EPSILON
+#define FLT_EPSILON 1.19209290e-7f
+#endif
+
+namespace cv { namespace dnn {
+
+const float kNegativeInfinity = -std::numeric_limits<float>::infinity();
+
+const float kMinLogDiffFloat = std::log(FLT_EPSILON);
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1700)
+inline float Log1p(float x) {  return log1pf(x); }
+#else
+inline float Log1p(float x) {
+  const float cutoff = 1.0e-07;
+  if (x < cutoff)
+    return x - 2 * x * x;
+  else
+    return Log(1.0 + x);
+}
+#endif
+
+inline float Exp(float x) { return expf(x); }
+
+inline float LogAdd(float x, float y) {
+  float diff;
+  if (x < y) {
+    diff = x - y;
+    x = y;
+  } else {
+    diff = y - x;
+  }
+  // diff is negative.  x is now the larger one.
+
+  if (diff >= kMinLogDiffFloat) {
+    float res;
+    res = x + Log1p(Exp(diff));
+    return res;
+  } else {
+    return x;  // return the larger one.
+  }
+}
+
+}}  // namespace
+
+#endif  // __OPENCV_DNN_MATH_UTILS_HPP__
diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp
index 0af8223a7feb..bc8709d22edc 100644
--- a/modules/dnn/src/model.cpp
+++ b/modules/dnn/src/model.cpp
@@ -3,8 +3,10 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "precomp.hpp"
+#include "math_utils.hpp"
 #include <algorithm>
 #include <utility>
+#include <unordered_map>
 #include <iterator>
 
 #include <opencv2/imgproc.hpp>
@@ -552,6 +554,9 @@ struct TextRecognitionModel_Impl : public Model::Impl
     std::string decodeType;
     std::vector<std::string> vocabulary;
 
+    int beamSize = 10;
+    int vocPruneSize = 0;
+
     TextRecognitionModel_Impl()
     {
         CV_TRACE_FUNCTION();
@@ -575,6 +580,13 @@ struct TextRecognitionModel_Impl : public Model::Impl
         decodeType = type;
     }
 
+    inline
+    void setDecodeOptsCTCPrefixBeamSearch(int beam, int vocPrune)
+    {
+        beamSize = beam;
+        vocPruneSize = vocPrune;
+    }
+
     virtual
     std::string decode(const Mat& prediction)
     {
@@ -586,53 +598,213 @@ struct TextRecognitionModel_Impl : public Model::Impl
             CV_Error(Error::StsBadArg, "TextRecognitionModel: vocabulary is not specified");
 
         std::string decodeSeq;
-        if (decodeType == "CTC-greedy")
+        if (decodeType == "CTC-greedy") {
+            decodeSeq = ctcGreedyDecode(prediction);
+        } else if (decodeType == "CTC-prefix-beam-search") {
+            decodeSeq = ctcPrefixBeamSearchDecode(prediction);
+        } else if (decodeType.length() == 0) {
+            CV_Error(Error::StsBadArg, "Please set decodeType");
+        } else {
+            CV_Error_(Error::StsBadArg, ("Unsupported decodeType: %s", decodeType.c_str()));
+        }
+
+        return decodeSeq;
+    }
+
+    virtual
+    std::string ctcGreedyDecode(const Mat& prediction)
+    {
+        std::string decodeSeq;
+        CV_CheckEQ(prediction.dims, 3, "");
+        CV_CheckType(prediction.type(), CV_32FC1, "");
+        const int vocLength = (int)(vocabulary.size());
+        CV_CheckLE(prediction.size[1], vocLength, "");
+        bool ctcFlag = true;
+        int lastLoc = 0;
+        for (int i = 0; i < prediction.size[0]; i++)
         {
-            CV_CheckEQ(prediction.dims, 3, "");
-            CV_CheckType(prediction.type(), CV_32FC1, "");
-            const int vocLength = (int)(vocabulary.size());
-            CV_CheckLE(prediction.size[1], vocLength, "");
-            bool ctcFlag = true;
-            int lastLoc = 0;
-            for (int i = 0; i < prediction.size[0]; i++)
+            const float* pred = prediction.ptr<float>(i);
+            int maxLoc = 0;
+            float maxScore = pred[0];
+            for (int j = 1; j < vocLength + 1; j++)
             {
-                const float* pred = prediction.ptr<float>(i);
-                int maxLoc = 0;
-                float maxScore = pred[0];
-                for (int j = 1; j < vocLength + 1; j++)
+                float score = pred[j];
+                if (maxScore < score)
                 {
-                    float score = pred[j];
-                    if (maxScore < score)
-                    {
-                        maxScore = score;
-                        maxLoc = j;
-                    }
+                    maxScore = score;
+                    maxLoc = j;
                 }
+            }
 
-                if (maxLoc > 0)
-                {
-                    std::string currentChar = vocabulary.at(maxLoc - 1);
-                    if (maxLoc != lastLoc || ctcFlag)
-                    {
-                        lastLoc = maxLoc;
-                        decodeSeq += currentChar;
-                        ctcFlag = false;
-                    }
-                }
-                else
+            if (maxLoc > 0)
+            {
+                std::string currentChar = vocabulary.at(maxLoc - 1);
+                if (maxLoc != lastLoc || ctcFlag)
                 {
-                    ctcFlag = true;
+                    lastLoc = maxLoc;
+                    decodeSeq += currentChar;
+                    ctcFlag = false;
                 }
             }
-        } else if (decodeType.length() == 0) {
-            CV_Error(Error::StsBadArg, "Please set decodeType");
-        } else {
-            CV_Error_(Error::StsBadArg, ("Unsupported decodeType: %s", decodeType.c_str()));
+            else
+            {
+                ctcFlag = true;
+            }
         }
-
         return decodeSeq;
     }
 
+    struct PrefixScore
+    {
+        // blank ending score
+        float pB;
+        // none blank ending score
+        float pNB;
+
+        PrefixScore() : pB(kNegativeInfinity), pNB(kNegativeInfinity)
+        {
+
+        }
+        PrefixScore(float pB, float pNB) : pB(pB), pNB(pNB)
+        {
+
+        }
+    };
+
+    struct PrefixHash
+    {
+        size_t operator()(const std::vector<int>& prefix) const
+        {
+              // BKDR hash
+              unsigned int seed = 131;
+              size_t hash = 0;
+              for (size_t i = 0; i < prefix.size(); i++)
+              {
+                  hash = hash * seed + prefix[i];
+              }
+              return hash;
+        }
+    };
+
+    static
+    std::vector<std::pair<float, int>> TopK(
+                      const float* predictions, int length, int k)
+    {
+        std::vector<std::pair<float, int>> results;
+        // No prune.
+        if (k <= 0)
+        {
+            for (int i = 0; i < length; ++i)
+            {
+                results.emplace_back(predictions[i], i);
+            }
+            return results;
+        }
+
+        for (int i = 0; i < k; ++i)
+        {
+            results.emplace_back(predictions[i], i);
+        }
+        std::make_heap(results.begin(), results.end(), std::greater<std::pair<float, int>>{});
+
+        for (int i = k; i < length; ++i)
+        {
+            if (predictions[i] > results.front().first)
+            {
+                std::pop_heap(results.begin(), results.end(), std::greater<std::pair<float, int>>{});
+                results.pop_back();
+                results.emplace_back(predictions[i], i);
+                std::push_heap(results.begin(), results.end(), std::greater<std::pair<float, int>>{});
+            }
+        }
+        return results;
+    }
+
+    static inline
+    bool PrefixScoreCompare(
+            const std::pair<std::vector<int>, PrefixScore>& a,
+            const std::pair<std::vector<int>, PrefixScore>& b)
+    {
+            float probA = LogAdd(a.second.pB, a.second.pNB);
+            float probB = LogAdd(b.second.pB, b.second.pNB);
+            return probA > probB;
+    }
+
+    virtual
+    std::string ctcPrefixBeamSearchDecode(const Mat& prediction) {
+          // CTC prefix beam seach decode.
+          // For more detail, refer to:
+          // https://distill.pub/2017/ctc/#inference
+          // https://gist.github.com/awni/56369a90d03953e370f3964c826ed4b0i
+          using Beam = std::vector<std::pair<std::vector<int>, PrefixScore>>;
+          using BeamInDict = std::unordered_map<std::vector<int>, PrefixScore, PrefixHash>;
+
+          CV_CheckType(prediction.type(), CV_32FC1, "");
+          CV_CheckEQ(prediction.dims, 3, "");
+          CV_CheckEQ(prediction.size[1], 1, "");
+          CV_CheckEQ(prediction.size[2], (int)vocabulary.size() + 1, "");  // Length add 1 for ctc blank
+
+          std::string decodeSeq;
+          Beam beam = {std::make_pair(std::vector<int>(), PrefixScore(0.0, kNegativeInfinity))};
+          for (int i = 0; i < prediction.size[0]; i++)
+          {
+              // Loop over time
+              BeamInDict nextBeam;
+              const float* pred = prediction.ptr<float>(i);
+              std::vector<std::pair<float, int>> topkPreds =
+                  TopK(pred, vocabulary.size() + 1, vocPruneSize);
+              for (const auto& each : topkPreds)
+              {
+                  // Loop over vocabulary
+                  float prob = each.first;
+                  int token = each.second;
+                  for (const auto& it : beam)
+                  {
+                      const std::vector<int>& prefix = it.first;
+                      const PrefixScore& prefixScore = it.second;
+                      if (token == 0)  // 0 stands for ctc blank
+                      {
+                          PrefixScore& nextScore = nextBeam[prefix];
+                          nextScore.pB = LogAdd(nextScore.pB,
+                              LogAdd(prefixScore.pB + prob, prefixScore.pNB + prob));
+                          continue;
+                      }
+
+                      std::vector<int> nPrefix(prefix);
+                      nPrefix.push_back(token);
+                      PrefixScore& nextScore = nextBeam[nPrefix];
+                      if (prefix.size() > 0 && token == prefix.back())
+                      {
+                          nextScore.pNB = LogAdd(nextScore.pNB, prefixScore.pB + prob);
+                          PrefixScore& mScore = nextBeam[prefix];
+                          mScore.pNB = LogAdd(mScore.pNB, prefixScore.pNB + prob);
+                      }
+                      else
+                      {
+                          nextScore.pNB = LogAdd(nextScore.pNB,
+                              LogAdd(prefixScore.pB + prob, prefixScore.pNB + prob));
+                      }
+                  }
+              }
+              // Beam prune
+              Beam newBeam(nextBeam.begin(), nextBeam.end());
+              int newBeamSize = std::min(static_cast<int>(newBeam.size()), beamSize);
+              std::nth_element(newBeam.begin(), newBeam.begin() + newBeamSize,
+                     newBeam.end(), PrefixScoreCompare);
+              newBeam.resize(newBeamSize);
+              std::sort(newBeam.begin(), newBeam.end(), PrefixScoreCompare);
+              beam = std::move(newBeam);
+          }
+
+          CV_Assert(!beam.empty());
+          for (int token : beam[0].first)
+          {
+              CV_Check(token, token > 0 && token <= vocabulary.size(), "");
+              decodeSeq += vocabulary.at(token - 1);
+          }
+          return decodeSeq;
+    }
+
     virtual
     std::string recognize(InputArray frame)
     {
@@ -698,6 +870,12 @@ const std::string& TextRecognitionModel::getDecodeType() const
     return TextRecognitionModel_Impl::from(impl).decodeType;
 }
 
+TextRecognitionModel& TextRecognitionModel::setDecodeOptsCTCPrefixBeamSearch(int beamSize, int vocPruneSize)
+{
+    TextRecognitionModel_Impl::from(impl).setDecodeOptsCTCPrefixBeamSearch(beamSize, vocPruneSize);
+    return *this;
+}
+
 TextRecognitionModel& TextRecognitionModel::setVocabulary(const std::vector<std::string>& inputVoc)
 {
     TextRecognitionModel_Impl::from(impl).setVocabulary(inputVoc);
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 059fc8f402af..cff2bdc0f401 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -1257,8 +1257,11 @@ bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
     else if (config->tested)
         return false;
 
-    int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
-    top.zeros(4, sz, (use_half_) ? CV_16SC1 : CV_32FC1);
+    //int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
+    CV_CheckEQ(top.total(), (size_t)numImages * num_output_ * output_h_ * output_w_, "");
+    CV_CheckTypeEQ(top.type(), (use_half_) ? CV_16SC1 : CV_32FC1, "");
+    top.setTo(Scalar::all(0));
+
     bool saved_tuned = tuned_;
     tuned_ = false;
     convolve(bottom, top, weight, bias, numImages, config);
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 98714bbd5c29..91880cba6a23 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -41,6 +41,8 @@ CV__DNN_INLINE_NS_BEGIN
 
 extern bool DNN_DIAGNOSTICS_RUN;
 
+class ONNXLayerHandler;
+
 class ONNXImporter
 {
     opencv_onnx::ModelProto model_proto;
@@ -61,60 +63,16 @@ class ONNXImporter
     void addConstant(const std::string& name, const Mat& blob);
     void addLayer(LayerParams& layerParams,
                   const opencv_onnx::NodeProto& node_proto);
-    static const std::set<String>& getSupportedTypes();
 
 public:
-
-    ONNXImporter(Net& net, const char *onnxFile)
-        : dstNet(net), utilNet()
-    {
-        hasDynamicShapes = false;
-        CV_Assert(onnxFile);
-        CV_LOG_DEBUG(NULL, "DNN/ONNX: processing ONNX model from file: " << onnxFile);
-
-        std::fstream input(onnxFile, std::ios::in | std::ios::binary);
-        if (!input)
-        {
-            CV_Error(Error::StsBadArg, cv::format("Can't read ONNX file: %s", onnxFile));
-        }
-
-        if (!model_proto.ParseFromIstream(&input))
-        {
-            CV_Error(Error::StsUnsupportedFormat, cv::format("Failed to parse ONNX model: %s", onnxFile));
-        }
-
-        populateNet();
-    }
-
-    ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer)
-        : dstNet(net), utilNet()
-    {
-        hasDynamicShapes = false;
-        CV_LOG_DEBUG(NULL, "DNN/ONNX: processing in-memory ONNX model (" << sizeBuffer << " bytes)");
-
-        struct _Buf : public std::streambuf
-        {
-            _Buf(const char* buffer, size_t sizeBuffer)
-            {
-                char* p = const_cast<char*>(buffer);
-                setg(p, p, p + sizeBuffer);
-            }
-        };
-
-        _Buf buf(buffer, sizeBuffer);
-        std::istream input(&buf);
-
-        if (!model_proto.ParseFromIstream(&input))
-            CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model from in-memory byte array.");
-
-        populateNet();
-    }
+    ONNXImporter(Net& net, const char *onnxFile);
+    ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer);
 
     void populateNet();
 
 protected:
+    std::unique_ptr<ONNXLayerHandler> layerHandler;
     Net& dstNet;
-    Net utilNet;
 
     opencv_onnx::GraphProto graph_proto;
     std::string framework_name;
@@ -129,8 +87,137 @@ class ONNXImporter
     typedef std::map<std::string, LayerInfo>::iterator IterLayerId_t;
 
     void handleNode(const opencv_onnx::NodeProto& node_proto);
+
+private:
+    friend class ONNXLayerHandler;
+    typedef void (ONNXImporter::*ONNXImporterNodeParser)(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    typedef std::map<std::string, ONNXImporterNodeParser> DispatchMap;
+
+    const DispatchMap dispatch;
+    static const DispatchMap buildDispatchMap();
+
+    void parseMaxPool              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseAveragePool          (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseReduce               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseSlice                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseSplit                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseBias                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parsePow                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseMax                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseNeg                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseConstant             (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseLSTM                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseGRU                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseImageScaler          (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseClip                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseLeakyRelu            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseRelu                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseElu                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseTanh                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parsePRelu                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseLRN                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseInstanceNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseBatchNormalization   (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseGemm                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseMatMul               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseMul                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseConv                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseConvTranspose        (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseTranspose            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseSqueeze              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseFlatten              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseUnsqueeze            (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseExpand               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseReshape              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parsePad                  (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseShape                (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseCast                 (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseConstantFill         (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseGather               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseConcat               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseResize               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseUpsample             (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseSoftMax              (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseDetectionOutput      (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+    void parseCumSum               (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+
+    void parseCustomLayer          (LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto);
+};
+
+class ONNXLayerHandler : public detail::LayerHandler
+{
+public:
+    explicit ONNXLayerHandler(ONNXImporter* importer_);
+
+    void fillRegistry(const opencv_onnx::GraphProto& net);
+
+protected:
+    ONNXImporter* importer;
 };
 
+ONNXLayerHandler::ONNXLayerHandler(ONNXImporter* importer_) : importer(importer_){}
+
+void ONNXLayerHandler::fillRegistry(const opencv_onnx::GraphProto &net)
+{
+    int layersSize = net.node_size();
+    for (int li = 0; li < layersSize; li++) {
+        const opencv_onnx::NodeProto &node_proto = net.node(li);
+        const std::string& name = node_proto.output(0);
+        const std::string& type = node_proto.op_type();
+        if (importer->dispatch.find(type) == importer->dispatch.end())
+        {
+            addMissing(name, type);
+        }
+    }
+    printMissing();
+}
+
+ONNXImporter::ONNXImporter(Net& net, const char *onnxFile)
+    : layerHandler(DNN_DIAGNOSTICS_RUN ?  new ONNXLayerHandler(this) : nullptr),
+        dstNet(net), dispatch(buildDispatchMap())
+{
+    hasDynamicShapes = false;
+    CV_Assert(onnxFile);
+    CV_LOG_DEBUG(NULL, "DNN/ONNX: processing ONNX model from file: " << onnxFile);
+
+    std::fstream input(onnxFile, std::ios::in | std::ios::binary);
+    if (!input)
+    {
+        CV_Error(Error::StsBadArg, cv::format("Can't read ONNX file: %s", onnxFile));
+    }
+
+    if (!model_proto.ParseFromIstream(&input))
+    {
+        CV_Error(Error::StsUnsupportedFormat, cv::format("Failed to parse ONNX model: %s", onnxFile));
+    }
+
+    populateNet();
+}
+
+ONNXImporter::ONNXImporter(Net& net, const char* buffer, size_t sizeBuffer)
+    : layerHandler(DNN_DIAGNOSTICS_RUN ?  new ONNXLayerHandler(this) : nullptr), dstNet(net), dispatch(buildDispatchMap())
+{
+    hasDynamicShapes = false;
+    CV_LOG_DEBUG(NULL, "DNN/ONNX: processing in-memory ONNX model (" << sizeBuffer << " bytes)");
+
+    struct _Buf : public std::streambuf
+            {
+        _Buf(const char* buffer, size_t sizeBuffer)
+        {
+            char* p = const_cast<char*>(buffer);
+            setg(p, p, p + sizeBuffer);
+        }
+            };
+
+    _Buf buf(buffer, sizeBuffer);
+    std::istream input(&buf);
+
+    if (!model_proto.ParseFromIstream(&input))
+        CV_Error(Error::StsUnsupportedFormat, "Failed to parse onnx model from in-memory byte array.");
+
+    populateNet();
+}
+
 inline void replaceLayerParam(LayerParams& layerParams, const String& oldKey, const String& newKey)
 {
     if (layerParams.has(oldKey)) {
@@ -202,6 +289,10 @@ static DictValue parse(const ::google::protobuf::RepeatedField< ::google::protob
     return DictValue::arrayInt(&dst[0], src.size());
 }
 
+static DictValue parseStr(const ::google::protobuf::RepeatedPtrField< ::std::string>& src) {
+    return DictValue::arrayString(src.begin(), static_cast<int>(src.size()));
+}
+
 LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_proto)
 {
     LayerParams lp;
@@ -261,6 +352,10 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot
                 CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
                 lp.set("dilation", parse(attribute_proto.ints()));
             }
+            else if(attribute_name == "activations" && node_proto.op_type() == "LSTM")
+            {
+                lp.set(attribute_name, parseStr(attribute_proto.strings()));
+            }
             else if (attribute_proto.has_i())
             {
                 ::google::protobuf::int64 src = attribute_proto.i();
@@ -361,11 +456,7 @@ Mat ONNXImporter::getBlob(const std::string& input_name)
 void ONNXImporter::addLayer(LayerParams& layerParams,
                             const opencv_onnx::NodeProto& node_proto)
 {
-    int id;
-    if (DNN_DIAGNOSTICS_RUN)
-        id = utilNet.addLayer(layerParams.name, layerParams.type, layerParams);
-    else
-        id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
+    int id = dstNet.addLayer(layerParams.name, layerParams.type, layerParams);
     for (int i = 0; i < node_proto.output_size(); ++i)
     {
         layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(id, i)));
@@ -378,10 +469,7 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
         const std::string& input_name = node_proto.input(j);
         IterLayerId_t layerId = layer_id.find(input_name);
         if (layerId != layer_id.end()) {
-            if (DNN_DIAGNOSTICS_RUN)
-                utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum);
-            else
-                dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum);
+            dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, inpNum);
             ++inpNum;
             // Collect input shapes.
             IterShape_t shapeIt = outShapes.find(input_name);
@@ -390,11 +478,7 @@ void ONNXImporter::addLayer(LayerParams& layerParams,
         }
     }
     // Compute shape of output blob for this layer.
-    Ptr<Layer> layer;
-    if (DNN_DIAGNOSTICS_RUN)
-        layer = utilNet.getLayer(id);
-    else
-        layer = dstNet.getLayer(id);  // FIXIT: avoid instantiation of layers during the import stage
+    Ptr<Layer> layer = dstNet.getLayer(id);  // FIXIT: avoid instantiation of layers during the import stage
     layer->getMemoryShapes(layerInpShapes, 0, layerOutShapes, layerInternalShapes);
     for (int i = 0; i < node_proto.output_size() && i < (int)layerOutShapes.size(); ++i)
     {
@@ -471,35 +555,11 @@ void ONNXImporter::populateNet()
             layer_id.insert(std::make_pair(name, LayerInfo(0, netInputs.size() - 1)));
         }
     }
-    utilNet.setInputsNames(netInputs);
     dstNet.setInputsNames(netInputs);
 
     if (DNN_DIAGNOSTICS_RUN) {
-        auto &supportedTypes = getSupportedTypes();
-        for (int li = 0; li < layersSize; li++) {
-            const opencv_onnx::NodeProto &node_proto = graph_proto.node(li);
-            std::string name = node_proto.output(0);
-            std::string layer_type = node_proto.op_type();
-            auto registered = supportedTypes.find(layer_type);
-            if (registered == supportedTypes.end()) {
-                CV_LOG_ERROR(NULL, "DNN/ONNX: NOTE: Potential problem with creating node " << name<< " with type " << layer_type << ".\n Type "
-                    << layer_type << " IS NOT SUPPORTED!\n"
-                );
-            }
-        }
-        auto oldConstBlobs = constBlobs;
-        auto oldOutShapes = outShapes;
-        auto oldLayerId = layer_id;
         CV_LOG_INFO(NULL, "DNN/ONNX: start diagnostic run!");
-        for (int li = 0; li < layersSize; li++) {
-            const opencv_onnx::NodeProto &node_proto = graph_proto.node(li);
-            handleNode(node_proto);
-        }
-        CV_LOG_INFO(NULL, "DNN/ONNX: diagnostic run completed!");
-        constBlobs = oldConstBlobs;
-        outShapes = oldOutShapes;
-        layer_id = oldLayerId;
-        enableModelDiagnostics(false);
+        layerHandler->fillRegistry(graph_proto);
     }
 
     for(int li = 0; li < layersSize; li++)
@@ -508,90 +568,14 @@ void ONNXImporter::populateNet()
         handleNode(node_proto);
     }
 
-    CV_LOG_DEBUG(NULL, "DNN/ONNX: import completed!");
-}
-
-const std::set<String>& ONNXImporter::getSupportedTypes()
-{
-    static const std::set<String> layerTypes = {
-        "MaxPool",
-        "AveragePool",
-        "GlobalAveragePool",
-        "GlobalMaxPool",
-        "ReduceMean",
-        "ReduceSum",
-        "ReduceMax",
-        "Slice",
-        "Split",
-        "Add",
-        "Sum",
-        "Sub",
-        "Pow",
-        "Max",
-        "Neg",
-        "Constant",
-        "LSTM",
-        "ImageScaler",
-        "Clip",
-        "LeakyRelu",
-        "Relu",
-        "Elu",
-        "Tanh",
-        "PRelu",
-        "LRN",
-        "InstanceNormalization",
-        "BatchNormalization",
-        "Gemm",
-        "MatMul",
-        "Mul",
-        "Div",
-        "Conv",
-        "ConvTranspose",
-        "Transpose",
-        "Squeeze",
-        "Flatten",
-        "Unsqueeze",
-        "Expand",
-        "Reshape",
-        "Pad",
-        "Shape",
-        "Cast",
-        "ConstantOfShape",
-        "ConstantFill",
-        "Gather",
-        "Concat",
-        "Resize",
-        "Upsample",
-        "SoftMax",
-        "Softmax",
-        "LogSoftmax",
-        "DetectionOutput",
-        "Interp",
-        "CropAndResize",
-        "ROIPooling",
-        "PSROIPooling",
-        "ChannelsPReLU",
-        "Sigmoid",
-        "Swish",
-        "Mish",
-        "AbsVal",
-        "BNLL",
-        "MaxUnpool",
-        "Dropout",
-        "Identity",
-        "Crop",
-        "Normalize"
-    };
-    return layerTypes;
+    CV_LOG_DEBUG(NULL, (DNN_DIAGNOSTICS_RUN ? "DNN/ONNX: diagnostic run completed!" : "DNN/ONNX: import completed!"));
 }
 
-void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
+void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto)
 {
-    opencv_onnx::NodeProto node_proto = node_proto_;  // TODO FIXIT
-
     CV_Assert(node_proto.output_size() >= 1);
     std::string name = node_proto.output(0);
-    std::string layer_type = node_proto.op_type();
+    const std::string& layer_type = node_proto.op_type();
     CV_LOG_DEBUG(NULL, "DNN/ONNX: processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: "
             << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str())
     );
@@ -605,1543 +589,1804 @@ void ONNXImporter::handleNode(const opencv_onnx::NodeProto& node_proto_)
         layerParams.type = layer_type;
         layerParams.set("has_dynamic_shapes", hasDynamicShapes);
 
-        if (layer_type == "MaxPool")
+        DispatchMap::const_iterator iter = dispatch.find(layer_type);
+        if (iter != dispatch.end())
         {
-            layerParams.type = "Pooling";
-            layerParams.set("pool", "MAX");
-            layerParams.set("ceil_mode", layerParams.has("pad_mode"));
+            CALL_MEMBER_FN(*this, iter->second)(layerParams, node_proto);
         }
-        else if (layer_type == "AveragePool")
+        else
         {
-            layerParams.type = "Pooling";
-            layerParams.set("pool", "AVE");
-            layerParams.set("ceil_mode", layerParams.has("pad_mode"));
-            layerParams.set("ave_pool_padded_area", framework_name == "pytorch");
+            parseCustomLayer(layerParams, node_proto);
         }
-        else if (layer_type == "GlobalAveragePool" || layer_type == "GlobalMaxPool" ||
-                layer_type == "ReduceMean" || layer_type == "ReduceSum" || layer_type == "ReduceMax")
+    }
+    catch (const cv::Exception& e)
+    {
+        if (DNN_DIAGNOSTICS_RUN)
         {
-            CV_Assert(node_proto.input_size() == 1);
-            layerParams.type = "Pooling";
-            String pool;
-            if (layer_type == "GlobalMaxPool" || layer_type == "ReduceMax")
-                pool = "MAX";
-            else if (layer_type == "ReduceSum")
-                pool = "SUM";
-            else
-                pool = "AVE";
-            layerParams.set("pool", pool);
-            layerParams.set("global_pooling", !layerParams.has("axes"));
-            if (layerParams.has("axes") && (layer_type == "ReduceMean" || layer_type == "ReduceSum" || layer_type == "ReduceMax"))
+            CV_LOG_ERROR(NULL, "DNN/ONNX: Potential problem during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: "
+                    << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str()) << "\n" << e.msg
+            );
+            cv::AutoLock lock(getLayerFactoryMutex());
+            auto registeredLayers = getLayerFactoryImpl();
+            if (registeredLayers.find(layerParams.type) != registeredLayers.end())
             {
-                MatShape inpShape = outShapes[node_proto.input(0)];
-                DictValue axes = layerParams.get("axes");
-                bool keepdims = layerParams.get<int>("keepdims");
-                MatShape targetShape;
-                std::vector<bool> shouldDelete(inpShape.size(), false);
-                for (int i = 0; i < axes.size(); i++) {
-                    int axis = normalize_axis(axes.get<int>(i), inpShape.size());
-                    shouldDelete[axis] = true;
-                }
-                for (int axis = 0; axis < inpShape.size(); ++axis){
-                    if (!shouldDelete[axis])
-                        targetShape.push_back(inpShape[axis]);
-                    else if (keepdims)
-                        targetShape.push_back(1);
-                }
-
-                if (inpShape.size() == 3 && axes.size() <= 2)
+                try
                 {
-                    int axis = normalize_axis(axes.get<int>(0), inpShape.size());
-                    CV_CheckNE(axis, 0, "");
-
-                    LayerParams reshapeLp;
-                    reshapeLp.name = layerParams.name + "/reshape";
-                    reshapeLp.type = "Reshape";
-                    CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
-                    reshapeLp.set("axis", 0);
-                    reshapeLp.set("num_axes", 1);
-                    int newShape[] = {1, -1};
-                    reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 2));
-
-                    opencv_onnx::NodeProto proto;
-                    proto.add_input(node_proto.input(0));
-                    proto.add_output(reshapeLp.name);
-                    addLayer(reshapeLp, proto);
-
-                    LayerParams avgLp;
-                    avgLp.name = layerParams.name + "/avg";
-                    avgLp.type = "Pooling";
-                    CV_Assert(layer_id.find(avgLp.name) == layer_id.end());
-                    avgLp.set("pool", pool);
-                    if (axes.size() == 2)
-                    {
-                        CV_CheckEQ(normalize_axis(axes.get<int>(0), inpShape.size()), 1, "Unsupported mode");
-                        CV_CheckEQ(normalize_axis(axes.get<int>(1), inpShape.size()), 2, "Unsupported mode");
-                        avgLp.set("global_pooling", true);
-                    }
-                    else
-                    {
-                        avgLp.set(axis == 2 ? "global_pooling_w" : "global_pooling_h", true);
-                        avgLp.set(axis == 2 ? "kernel_h" : "kernel_w", 1);
-                    }
-
-                    node_proto.set_input(0, reshapeLp.name);
-                    node_proto.set_output(0, avgLp.name);
-                    addLayer(avgLp, node_proto);
+                    Ptr<Layer> layer = LayerFactory::createLayerInstance(layerParams.type, layerParams);
                 }
-                else
+                catch (const std::exception& e)
                 {
-                    if (inpShape.size() != 4 && inpShape.size() != 5)
-                        CV_Error(Error::StsNotImplemented, "Unsupported input shape of " + layer_type + " operation.");
-
-                    CV_Assert(axes.size() <= inpShape.size() - 2);
-                    std::vector<int> kernel_size(inpShape.size() - 2, 1);
-                    if (axes.size() == 1 && (normalize_axis(axes.get<int>(0), inpShape.size()) <= 1))
-                    {
-                        int axis = normalize_axis(axes.get<int>(0), inpShape.size());
-                        MatShape newShape = inpShape;
-                        newShape[axis + 1] = total(newShape, axis + 1);
-                        newShape.resize(axis + 2);
-                        newShape.insert(newShape.begin(), 2 - axis, 1);
-
-                        LayerParams reshapeLp;
-                        reshapeLp.type = "Reshape";
-                        reshapeLp.name = layerParams.name + "/reshape";
-                        CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
-                        reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
-
-                        node_proto.set_output(0, reshapeLp.name);
-                        addLayer(reshapeLp, node_proto);
-
-                        kernel_size.resize(2);
-                        kernel_size[0] = inpShape[axis];
-                        node_proto.set_input(0, node_proto.output(0));
-                    }
-                    else
-                    {
-                        for (int i = 0; i < axes.size(); i++) {
-                            int axis = normalize_axis(axes.get<int>(i), inpShape.size());
-                            CV_Assert_N(axis >= 2 + i, axis < inpShape.size());
-                            kernel_size[axis - 2] = inpShape[axis];
-                        }
-                    }
-
-                    LayerParams poolLp = layerParams;
-                    poolLp.name = layerParams.name + "/avg";
-                    CV_Assert(layer_id.find(poolLp.name) == layer_id.end());
-                    poolLp.set("kernel_size", DictValue::arrayInt(&kernel_size[0], kernel_size.size()));
-
-                    node_proto.set_output(0, poolLp.name);
-                    addLayer(poolLp, node_proto);
+                    CV_LOG_ERROR(NULL, "DNN/ONNX: Layer of type " << layerParams.type << "(" << layer_type << ") cannot be created with parameters " << layerParams << ". Error: " << e.what()
+                    );
                 }
-
-                layerParams.type = "Reshape";
-                layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
-
-                node_proto.set_input(0, node_proto.output(0));
-                node_proto.set_output(0, layerParams.name);
             }
-            else if (!layerParams.has("axes") && (layer_type == "ReduceMean" || layer_type == "ReduceSum" || layer_type == "ReduceMax"))
+        }
+        else
+        {
+            CV_LOG_ERROR(NULL, "DNN/ONNX: ERROR during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: "
+                    << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str())
+            );
+        }
+        for (int i = 0; i < node_proto.input_size(); i++)
+        {
+            CV_LOG_INFO(NULL, "    Input[" << i << "] = '" << node_proto.input(i) << "'");
+        }
+        for (int i = 0; i < node_proto.output_size(); i++)
+        {
+            CV_LOG_INFO(NULL, "    Output[" << i << "] = '" << node_proto.output(i) << "'");
+        }
+        if (DNN_DIAGNOSTICS_RUN)
+        {
+            for (int i = 0; i < node_proto.output_size(); ++i)
             {
-                CV_CheckEQ(layerParams.get<int>("keepdims"), 0, "layer only supports keepdims = false");
-                LayerParams reshapeLp;
-                reshapeLp.name = layerParams.name + "/reshape";
-                reshapeLp.type = "Reshape";
-                CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
-                int newShape[] = {1, 1, 1, -1};
-                reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 4));
-
-                opencv_onnx::NodeProto proto;
-                proto.add_input(node_proto.input(0));
-                proto.add_output(reshapeLp.name);
-                addLayer(reshapeLp, proto);
+                layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(0, i)));
+                outShapes[node_proto.output(i)] = outShapes[node_proto.input(0)];
+            }
+        }
+        else
+            CV_Error(Error::StsError, cv::format("Node [%s]:(%s) parse error: %s", layer_type.c_str(), name.c_str(), e.what()));
+    }
+}
 
-                LayerParams poolLp = layerParams;
-                poolLp.name = layerParams.name + "/pool";
-                CV_Assert(layer_id.find(poolLp.name) == layer_id.end());
+void ONNXImporter::parseMaxPool(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Pooling";
+    layerParams.set("pool", "MAX");
+    layerParams.set("ceil_mode", layerParams.has("pad_mode"));
+    addLayer(layerParams, node_proto);
+}
 
-                node_proto.set_input(0, reshapeLp.name);
-                node_proto.set_output(0, poolLp.name);
-                addLayer(poolLp, node_proto);
+void ONNXImporter::parseAveragePool(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Pooling";
+    layerParams.set("pool", "AVE");
+    layerParams.set("ceil_mode", layerParams.has("pad_mode"));
+    layerParams.set("ave_pool_padded_area", framework_name == "pytorch");
+    addLayer(layerParams, node_proto);
+}
 
-                layerParams.type = "Reshape";
-                int targetShape[] = {1};
-                layerParams.set("dim", DictValue::arrayInt(&targetShape[0], 1));
+void ONNXImporter::parseReduce(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    const std::string& layer_type = node_proto.op_type();
+
+    CV_Assert(node_proto.input_size() == 1);
+    layerParams.type = "Pooling";
+    String pool;
+    if (layer_type == "GlobalMaxPool" || layer_type == "ReduceMax")
+        pool = "MAX";
+    else if (layer_type == "ReduceSum")
+        pool = "SUM";
+    else
+        pool = "AVE";
+    layerParams.set("pool", pool);
+    layerParams.set("global_pooling", !layerParams.has("axes"));
+    if (layerParams.has("axes") && (layer_type == "ReduceMean" || layer_type == "ReduceSum" || layer_type == "ReduceMax"))
+    {
+        MatShape inpShape = outShapes[node_proto.input(0)];
+        DictValue axes = layerParams.get("axes");
+        bool keepdims = layerParams.get<int>("keepdims");
+        MatShape targetShape;
+        std::vector<bool> shouldDelete(inpShape.size(), false);
+        for (int i = 0; i < axes.size(); i++) {
+            int axis = normalize_axis(axes.get<int>(i), inpShape.size());
+            shouldDelete[axis] = true;
+        }
+        for (int axis = 0; axis < inpShape.size(); ++axis){
+            if (!shouldDelete[axis])
+                targetShape.push_back(inpShape[axis]);
+            else if (keepdims)
+                targetShape.push_back(1);
+        }
 
-                node_proto.set_input(0, node_proto.output(0));
-                node_proto.set_output(0, layerParams.name);
+        if (inpShape.size() == 3 && axes.size() <= 2)
+        {
+            int axis = normalize_axis(axes.get<int>(0), inpShape.size());
+            CV_CheckNE(axis, 0, "");
+
+            LayerParams reshapeLp;
+            reshapeLp.name = layerParams.name + "/reshape";
+            reshapeLp.type = "Reshape";
+            CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+            reshapeLp.set("axis", 0);
+            reshapeLp.set("num_axes", 1);
+            int newShape[] = {1, -1};
+            reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 2));
+
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(0));
+            proto.add_output(reshapeLp.name);
+            addLayer(reshapeLp, proto);
+
+            LayerParams avgLp;
+            avgLp.name = layerParams.name + "/avg";
+            avgLp.type = "Pooling";
+            CV_Assert(layer_id.find(avgLp.name) == layer_id.end());
+            avgLp.set("pool", pool);
+            if (axes.size() == 2)
+            {
+                CV_CheckEQ(normalize_axis(axes.get<int>(0), inpShape.size()), 1, "Unsupported mode");
+                CV_CheckEQ(normalize_axis(axes.get<int>(1), inpShape.size()), 2, "Unsupported mode");
+                avgLp.set("global_pooling", true);
+            }
+            else
+            {
+                avgLp.set(axis == 2 ? "global_pooling_w" : "global_pooling_h", true);
+                avgLp.set(axis == 2 ? "kernel_h" : "kernel_w", 1);
             }
+
+            node_proto.set_input(0, reshapeLp.name);
+            node_proto.set_output(0, avgLp.name);
+            addLayer(avgLp, node_proto);
         }
-        else if (layer_type == "Slice")
+        else
         {
-            int axis = 0;
-            std::vector<int> begin;
-            std::vector<int> end;
-            std::vector<int> steps;
-            int inp_size = node_proto.input_size();
+            if (inpShape.size() != 4 && inpShape.size() != 5)
+                CV_Error(Error::StsNotImplemented, "Unsupported input shape of " + layer_type + " operation.");
 
-            if (inp_size == 1)
+            CV_Assert(axes.size() <= inpShape.size() - 2);
+            std::vector<int> kernel_size(inpShape.size() - 2, 1);
+            if (axes.size() == 1 && (normalize_axis(axes.get<int>(0), inpShape.size()) <= 1))
             {
-                if (layerParams.has("axes")) {
-                    DictValue axes = layerParams.get("axes");
-                    for (int i = 1; i < axes.size(); ++i) {
-                        CV_Assert(axes.get<int>(i - 1) == axes.get<int>(i) - 1);
-                    }
-                    axis = axes.get<int>(0);
-                }
+                int axis = normalize_axis(axes.get<int>(0), inpShape.size());
+                MatShape newShape = inpShape;
+                newShape[axis + 1] = total(newShape, axis + 1);
+                newShape.resize(axis + 2);
+                newShape.insert(newShape.begin(), 2 - axis, 1);
 
-                DictValue starts = layerParams.get("starts");
-                DictValue ends = layerParams.get("ends");
-                CV_Assert(starts.size() == ends.size());
+                LayerParams reshapeLp;
+                reshapeLp.type = "Reshape";
+                reshapeLp.name = layerParams.name + "/reshape";
+                CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+                reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], newShape.size()));
 
-                if (axis > 0) {
-                    begin.resize(axis, 0);
-                    end.resize(axis, -1);
-                }
-                for (int i = 0; i < starts.size(); ++i)
-                {
-                    begin.push_back(starts.get<int>(i));
-                    int finish = ends.get<int>(i);
-                    end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim
-                }
-            } else { // inp_size > 1
-                CV_Assert(inp_size >= 3);
-                for (int i = 1; i < inp_size; i++) {
-                    CV_Assert(constBlobs.find(node_proto.input(i)) != constBlobs.end());
-                }
-                Mat start_blob = getBlob(node_proto, 1);
-                Mat end_blob   = getBlob(node_proto, 2);
-                CV_Assert(start_blob.total() == end_blob.total());
-
-                if (inp_size > 3) {
-                    Mat axes_blob = getBlob(node_proto, 3);
-                    const int* axes = (int*)axes_blob.data;
-                    for (int i = 1; i < axes_blob.total(); ++i) {
-                        CV_Assert(axes[i - 1] == axes[i] - 1);
-                    }
-                    axis = axes[0];
-                }
+                node_proto.set_output(0, reshapeLp.name);
+                addLayer(reshapeLp, node_proto);
 
-                const int* starts = start_blob.ptr<int>();
-                const int* ends   = end_blob.ptr<int>();
-                if (axis > 0) {
-                    begin.resize(axis, 0);
-                    end.resize(axis, -1);
-                }
-                std::copy(starts, starts + start_blob.total(), std::back_inserter(begin));
-                for (int i = 0; i < end_blob.total(); ++i)
-                {
-                    int finish = ends[i];
-                    end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim
+                kernel_size.resize(2);
+                kernel_size[0] = inpShape[axis];
+                node_proto.set_input(0, node_proto.output(0));
+            }
+            else
+            {
+                for (int i = 0; i < axes.size(); i++) {
+                    int axis = normalize_axis(axes.get<int>(i), inpShape.size());
+                    CV_Assert_N(axis >= 2 + i, axis < inpShape.size());
+                    kernel_size[axis - 2] = inpShape[axis];
                 }
+            }
 
-                if (inp_size == 5) {
-                    CV_Assert(constBlobs.find(node_proto.input(4)) != constBlobs.end());
-                    Mat step_blob = getBlob(node_proto, 4);
-                    const int* steps_ptr = step_blob.ptr<int>();
+            LayerParams poolLp = layerParams;
+            poolLp.name = layerParams.name + "/avg";
+            CV_Assert(layer_id.find(poolLp.name) == layer_id.end());
+            poolLp.set("kernel_size", DictValue::arrayInt(&kernel_size[0], kernel_size.size()));
 
-                    if (axis > 0)
-                        steps.resize(axis, 1);
+            node_proto.set_output(0, poolLp.name);
+            addLayer(poolLp, node_proto);
+        }
 
-                    std::copy(steps_ptr, steps_ptr + step_blob.total(), std::back_inserter(steps));
+        layerParams.type = "Reshape";
+        layerParams.set("dim", DictValue::arrayInt(&targetShape[0], targetShape.size()));
 
-                    // Very strange application for Slice op with tensor reversing.
-                    // We just workaround it for 2d constants.
-                    if (constBlobs.find(node_proto.input(0)) != constBlobs.end() &&
-                        axis == 0 &&
-                        start_blob.at<int>(0) == -1 && step_blob.at<int>(0) == -1 &&
-                        end_blob.at<int>(0) == std::numeric_limits<int32_t>::min())
-                    {
-                        Mat inp = getBlob(node_proto, 0);
-                        if (inp.dims == 2)
-                        {
-                            Mat flipped;
-                            flip(inp, flipped, 0);
-                            addConstant(layerParams.name, flipped);
-                            return;
-                        }
-                    }
-                }
-            }
-            layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
-            layerParams.set("end", DictValue::arrayInt(&end[0], end.size()));
-            layerParams.set("axis", axis);
+        node_proto.set_input(0, node_proto.output(0));
+        node_proto.set_output(0, layerParams.name);
+    }
+    else if (!layerParams.has("axes") && (layer_type == "ReduceMean" || layer_type == "ReduceSum" || layer_type == "ReduceMax"))
+    {
+        CV_CheckEQ(layerParams.get<int>("keepdims"), 0, "layer only supports keepdims = false");
+        LayerParams reshapeLp;
+        reshapeLp.name = layerParams.name + "/reshape";
+        reshapeLp.type = "Reshape";
+        CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+        int newShape[] = {1, 1, 1, -1};
+        reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 4));
+
+        opencv_onnx::NodeProto proto;
+        proto.add_input(node_proto.input(0));
+        proto.add_output(reshapeLp.name);
+        addLayer(reshapeLp, proto);
+
+        LayerParams poolLp = layerParams;
+        poolLp.name = layerParams.name + "/pool";
+        CV_Assert(layer_id.find(poolLp.name) == layer_id.end());
+
+        node_proto.set_input(0, reshapeLp.name);
+        node_proto.set_output(0, poolLp.name);
+        addLayer(poolLp, node_proto);
+
+        layerParams.type = "Reshape";
+        int targetShape[] = {1};
+        layerParams.set("dim", DictValue::arrayInt(&targetShape[0], 1));
+
+        node_proto.set_input(0, node_proto.output(0));
+        node_proto.set_output(0, layerParams.name);
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            if (!steps.empty())
-                layerParams.set("steps", DictValue::arrayInt(&steps[0], steps.size()));
+void ONNXImporter::parseSlice(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    int axis = 0;
+    std::vector<int> begin;
+    std::vector<int> end;
+    std::vector<int> steps;
+    int inp_size = node_proto.input_size();
 
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                Mat inp = getBlob(node_proto, 0);
-                std::vector<Mat> inputs, sliced;
-                inputs.push_back(inp);
-                runLayer(layerParams, inputs, sliced);
-                CV_Assert(sliced.size() == 1);
-                addConstant(layerParams.name, sliced[0]);
-                return;
+    if (inp_size == 1)
+    {
+        if (layerParams.has("axes")) {
+            DictValue axes = layerParams.get("axes");
+            for (int i = 1; i < axes.size(); ++i) {
+                CV_Assert(axes.get<int>(i - 1) == axes.get<int>(i) - 1);
             }
+            axis = axes.get<int>(0);
         }
-        else if (layer_type == "Split")
-        {
-            if (layerParams.has("split"))
-            {
-                DictValue splits = layerParams.get("split");
-                const int numSplits = splits.size();
-                CV_Assert(numSplits > 1);
 
-                std::vector<int> slicePoints(numSplits - 1, splits.get<int>(0));
-                for (int i = 1; i < splits.size() - 1; ++i)
-                {
-                    slicePoints[i] = slicePoints[i - 1] + splits.get<int>(i - 1);
-                }
-                layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
-            }
-            else
-            {
-                layerParams.set("num_split", node_proto.output_size());
-            }
-            layerParams.type = "Slice";
+        DictValue starts = layerParams.get("starts");
+        DictValue ends = layerParams.get("ends");
+        CV_Assert(starts.size() == ends.size());
+
+        if (axis > 0) {
+            begin.resize(axis, 0);
+            end.resize(axis, -1);
         }
-        else if (layer_type == "Add" || layer_type == "Sum" || layer_type == "Sub")
+        for (int i = 0; i < starts.size(); ++i)
         {
-            bool isSub = layer_type == "Sub";
-            CV_CheckEQ(node_proto.input_size(), 2, "");
-            bool is_const_0 = layer_id.find(node_proto.input(0)) == layer_id.end();
-            bool is_const_1 = layer_id.find(node_proto.input(1)) == layer_id.end();
-            if (is_const_0 && is_const_1)
-            {
-                Mat blob_0 = getBlob(node_proto, 0);
-                Mat blob_1 = getBlob(node_proto, 1);
-                CV_Assert(blob_0.size == blob_1.size);
-                Mat output = isSub ? (blob_0 - blob_1) : (blob_0 + blob_1);
-                addConstant(layerParams.name, output);
-                return;
-            }
-            else if (is_const_0 || is_const_1)
-            {
-                int const_blob_id = is_const_0 ? 0 : 1;
-                Mat blob = getBlob(node_proto, const_blob_id);
-                int blob_total = blob.total();
-                if (blob_total == 1) {
-                    layerParams.type = "Power";
-                    layerParams.set("shift", (isSub ? -1 : 1) * blob.at<float>(0));
-                }
-                else {
-                    MatShape inpShape = outShapes[node_proto.input(1 - const_blob_id)];
-                    if (shape(blob) == inpShape)
-                    {
-                        LayerParams constParams;
-                        constParams.name = layerParams.name + "/const";
-                        constParams.type = "Const";
-                        constParams.blobs.push_back((isSub ? -1 : 1) * blob);
-                        int id;
-                        if (DNN_DIAGNOSTICS_RUN)
-                            id = utilNet.addLayer(constParams.name, constParams.type, constParams);
-                        else
-                            id = dstNet.addLayer(constParams.name, constParams.type, constParams);
-                        layer_id.insert(std::make_pair(constParams.name, LayerInfo(id, 0)));
-                        outShapes[constParams.name] = shape(blob);
-
-                        layerParams.type = "Eltwise";
-                        node_proto.set_input(const_blob_id, constParams.name);
-                    }
-                    else
-                    {
-                        layerParams.type = "Scale";
-                        layerParams.set("bias_term", true);
-                        int axis = 1;
-                        for (int i = 0; i < graph_proto.initializer_size(); i++)
-                        {
-                            opencv_onnx::TensorProto tensor_proto = graph_proto.initializer(i);
-                            if (tensor_proto.name() == node_proto.input(const_blob_id))
-                            {
-                                axis = inpShape.size() - tensor_proto.dims_size();
-                                break;
-                            }
-                        }
-                        layerParams.set("axis", axis);
-                        blob = blob.reshape(1, 1);
-                        layerParams.blobs.push_back((isSub ? -1 : 1) * blob);
-                    }
-                }
-            }
-            else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
-            {
-                layerParams.type = "Eltwise";
-                if (isSub)
-                {
-                    static float subCoeffs[] = {1.f, -1.f};
-                    layerParams.set("coeff", DictValue::arrayReal<float*>(subCoeffs, 2));
-                }
-            }
-            else
-            {
-                if (isSub)
-                {
-                    LayerParams powerParams;
-                    powerParams.name = layerParams.name + "/neg";
-                    powerParams.type = "Power";
-                    powerParams.set("scale", -1);
-
-                    int id;
-                    //Create Power layer
-                    if (DNN_DIAGNOSTICS_RUN)
-                        id = utilNet.addLayer(powerParams.name, powerParams.type, powerParams);
-                    else
-                        id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams);
-                    //Connect to input
-                    IterLayerId_t layerId = layer_id.find(node_proto.input(1));
-                    CV_Assert(layerId != layer_id.end());
-                    if (DNN_DIAGNOSTICS_RUN)
-                        utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-                    else
-                        dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-                    //Add shape
-                    layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0)));
-                    outShapes[powerParams.name] = outShapes[node_proto.input(1)];
-
-                    //Replace input to Power
-                    node_proto.set_input(1, powerParams.name);
-                }
-                layerParams.type = "Scale";
-                layerParams.set("bias_term", true);
-            }
+            begin.push_back(starts.get<int>(i));
+            int finish = ends.get<int>(i);
+            end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim
         }
-        else if (layer_type == "Pow")
-        {
-            if (layer_id.find(node_proto.input(1)) != layer_id.end())
-                CV_Error(Error::StsNotImplemented, "Unsupported Pow op with variable power");
-
-            Mat blob = getBlob(node_proto, 1);
-            if (blob.total() != 1)
-                CV_Error(Error::StsNotImplemented, "Pow op supports only scalar power");
-
-            blob.convertTo(blob, CV_32F);
-            layerParams.type = "Power";
-            layerParams.set("power", blob.at<float>(0));
+    } else { // inp_size > 1
+        CV_Assert(inp_size >= 3);
+        for (int i = 1; i < inp_size; i++) {
+            CV_Assert(constBlobs.find(node_proto.input(i)) != constBlobs.end());
         }
-        else if (layer_type == "Max")
-        {
-            layerParams.type = "Eltwise";
-            layerParams.set("operation", "max");
+        Mat start_blob = getBlob(node_proto, 1);
+        Mat end_blob   = getBlob(node_proto, 2);
+        CV_Assert(start_blob.total() == end_blob.total());
+
+        if (inp_size > 3) {
+            Mat axes_blob = getBlob(node_proto, 3);
+            const int* axes = (int*)axes_blob.data;
+            for (int i = 1; i < axes_blob.total(); ++i) {
+                CV_Assert(axes[i - 1] == axes[i] - 1);
+            }
+            axis = axes[0];
         }
-        else if (layer_type == "Neg")
-        {
-            layerParams.type = "Power";
-            layerParams.set("scale", -1);
+
+        const int* starts = start_blob.ptr<int>();
+        const int* ends   = end_blob.ptr<int>();
+        if (axis > 0) {
+            begin.resize(axis, 0);
+            end.resize(axis, -1);
         }
-        else if (layer_type == "Constant")
+        std::copy(starts, starts + start_blob.total(), std::back_inserter(begin));
+        for (int i = 0; i < end_blob.total(); ++i)
         {
-            CV_Assert(node_proto.input_size() == 0);
-            CV_Assert(layerParams.blobs.size() == 1);
-            addConstant(layerParams.name, layerParams.blobs[0]);
-            return;
+            int finish = ends[i];
+            end.push_back((finish < 0) ? --finish : finish); // numpy doesn't include last dim
         }
-        else if (layer_type == "LSTM")
-        {
-            LayerParams lstmParams = layerParams;
-            lstmParams.name += "/lstm";
-
-            // https://pytorch.org/docs/stable/nn.html#lstm
-            CV_Assert(node_proto.input_size() == 7);
-            Mat Wx = getBlob(node_proto, 1);
-            Mat Wh = getBlob(node_proto, 2);
-            Mat b = getBlob(node_proto, 3);
-            CV_CheckEQ(countNonZero(getBlob(node_proto, 5)), 0, "Unsupported non zero initial_h");
-            CV_CheckEQ(countNonZero(getBlob(node_proto, 6)), 0, "Unsupported non zero initial_c");
-            b = b.reshape(1, b.size[0]);
-
-            const int numHidden = lstmParams.get<int>("hidden_size");
-            const int numDirs = Wx.size[0];  // Is 1 for forward only and 2 for bidirectional LSTM.
-            const int numFeatures = Wx.size[2];
-            Mat bx = b.colRange(0, b.cols / 2);
-            Mat bh = b.colRange(b.cols / 2, b.cols);
-            b = bx + bh;
-
-            // IFGO->IGFO
-            for (int k = 0; k < numDirs; ++k)
-            {
-                float* WxData = Wx.ptr<float>(k);
-                float* WhData = Wh.ptr<float>(k);
-                float* biasData = b.ptr<float>(k);
-                for (int j = 0; j < numHidden; ++j)
-                {
-                    for (int i = 0; i < numFeatures; ++i)
-                    {
-                        std::swap(WxData[(numHidden + j) * numFeatures + i],
-                                  WxData[(numHidden * 2 + j) * numFeatures + i]);
-                    }
-                    for (int i = 0; i < numHidden; ++i)
-                    {
-                        std::swap(WhData[(numHidden + j) * numHidden + i],
-                                  WhData[(numHidden * 2 + j) * numHidden + i]);
-                    }
-                    std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
-                }
-            }
-            Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
-            Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
-
-            lstmParams.blobs.resize(3);
-            lstmParams.blobs[0] = Wh;
-            lstmParams.blobs[1] = Wx;
-            lstmParams.blobs[2] = b;
-            lstmParams.set("bidirectional", lstmParams.get<String>("direction", "") == "bidirectional");
-
-            node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
-            addLayer(lstmParams, node_proto);
 
-            MatShape lstmShape = outShapes[node_proto.output(0)];
+        if (inp_size == 5) {
+            CV_Assert(constBlobs.find(node_proto.input(4)) != constBlobs.end());
+            Mat step_blob = getBlob(node_proto, 4);
+            const int* steps_ptr = step_blob.ptr<int>();
 
-            // Add fake 1 as it is done in ONNX
-            lstmShape.insert(lstmShape.begin() + 1, 1);
+            if (axis > 0)
+                steps.resize(axis, 1);
 
-            layerParams.type = "Reshape";
-            layerParams.set("dim", DictValue::arrayInt(&lstmShape[0], lstmShape.size()));
-            node_proto.set_input(0, lstmParams.name);  // redirect input to LSTM
-            node_proto.set_output(0, layerParams.name);  // keep origin LSTM's name
-        }
-        else if (layer_type == "ImageScaler")
-        {
-            const float scale = layerParams.has("scale") ? layerParams.get<float>("scale") : 1.0f;
-            layerParams.erase("scale");
+            std::copy(steps_ptr, steps_ptr + step_blob.total(), std::back_inserter(steps));
 
-            if (layerParams.has("bias"))
+            // Very strange application for Slice op with tensor reversing.
+            // We just workaround it for 2d constants.
+            if (constBlobs.find(node_proto.input(0)) != constBlobs.end() &&
+                axis == 0 &&
+                start_blob.at<int>(0) == -1 && step_blob.at<int>(0) == -1 &&
+                end_blob.at<int>(0) == std::numeric_limits<int32_t>::min())
             {
-                layerParams.type = "Scale";
-                layerParams.blobs.push_back(
-                    Mat(Size(1,  layerParams.get("bias").size()), CV_32FC1, scale));
-
-                layerParams.set("bias_term", true);
-                Mat bias(1, layerParams.get("bias").size(), CV_32FC1);
-                for (int j = 0; j < bias.total(); j++) {
-                    bias.at<float>(0, j) = layerParams.get("bias").getRealValue(j);
+                Mat inp = getBlob(node_proto, 0);
+                if (inp.dims == 2)
+                {
+                    Mat flipped;
+                    flip(inp, flipped, 0);
+                    addConstant(layerParams.name, flipped);
+                    return;
                 }
-                layerParams.blobs.push_back(bias);
-                layerParams.erase("bias");
-            }
-            else {
-                layerParams.set("scale", scale);
-                layerParams.type = "Power";
             }
         }
-        else if (layer_type == "Clip")
-        {
-            layerParams.type = "ReLU6";
-            replaceLayerParam(layerParams, "min", "min_value");
-            replaceLayerParam(layerParams, "max", "max_value");
+    }
+    layerParams.set("begin", DictValue::arrayInt(&begin[0], begin.size()));
+    layerParams.set("end", DictValue::arrayInt(&end[0], end.size()));
+    layerParams.set("axis", axis);
 
-        }
-        else if (layer_type == "LeakyRelu")
-        {
-            layerParams.type = "ReLU";
-            replaceLayerParam(layerParams, "alpha", "negative_slope");
-        }
-        else if (layer_type == "Relu")
-        {
-            layerParams.type = "ReLU";
-        }
-        else if (layer_type == "Elu")
+    if (!steps.empty())
+        layerParams.set("steps", DictValue::arrayInt(&steps[0], steps.size()));
+
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        Mat inp = getBlob(node_proto, 0);
+        std::vector<Mat> inputs, sliced;
+        inputs.push_back(inp);
+        runLayer(layerParams, inputs, sliced);
+        CV_Assert(sliced.size() == 1);
+        addConstant(layerParams.name, sliced[0]);
+        return;
+    }
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseSplit(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    if (layerParams.has("split"))
+    {
+        DictValue splits = layerParams.get("split");
+        const int numSplits = splits.size();
+        CV_Assert(numSplits > 1);
+
+        std::vector<int> slicePoints(numSplits - 1, splits.get<int>(0));
+        for (int i = 1; i < splits.size() - 1; ++i)
         {
-            layerParams.type = "ELU";
+            slicePoints[i] = slicePoints[i - 1] + splits.get<int>(i);
         }
-        else if (layer_type == "Tanh")
-        {
-            layerParams.type = "TanH";
+        layerParams.set("slice_point", DictValue::arrayInt(&slicePoints[0], slicePoints.size()));
+    }
+    else
+    {
+        layerParams.set("num_split", node_proto.output_size());
+    }
+    layerParams.type = "Slice";
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseBias(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    const std::string& layer_type = node_proto.op_type();
+    bool isSub = layer_type == "Sub";
+    CV_CheckEQ(node_proto.input_size(), 2, "");
+    bool is_const_0 = layer_id.find(node_proto.input(0)) == layer_id.end();
+    bool is_const_1 = layer_id.find(node_proto.input(1)) == layer_id.end();
+    if (is_const_0 && is_const_1)
+    {
+        Mat blob_0 = getBlob(node_proto, 0);
+        Mat blob_1 = getBlob(node_proto, 1);
+        CV_Assert(blob_0.size == blob_1.size);
+        Mat output = isSub ? (blob_0 - blob_1) : (blob_0 + blob_1);
+        addConstant(layerParams.name, output);
+        return;
+    }
+    else if (is_const_0 || is_const_1)
+    {
+        int const_blob_id = is_const_0 ? 0 : 1;
+        Mat blob = getBlob(node_proto, const_blob_id);
+        int blob_total = blob.total();
+        if (blob_total == 1) {
+            layerParams.type = "Power";
+            layerParams.set("shift", (isSub ? -1 : 1) * blob.ptr<float>()[0]);
         }
-        else if (layer_type == "PRelu")
-        {
-            layerParams.type = "PReLU";
-            layerParams.blobs.push_back(getBlob(node_proto, 1));
+        else {
+            MatShape inpShape = outShapes[node_proto.input(1 - const_blob_id)];
+            if (shape(blob) == inpShape)
+            {
+                LayerParams constParams;
+                constParams.name = layerParams.name + "/const";
+                constParams.type = "Const";
+                constParams.blobs.push_back((isSub ? -1 : 1) * blob);
+                int id = dstNet.addLayer(constParams.name, constParams.type, constParams);
+                layer_id.insert(std::make_pair(constParams.name, LayerInfo(id, 0)));
+                outShapes[constParams.name] = shape(blob);
+
+                layerParams.type = "Eltwise";
+                node_proto.set_input(const_blob_id, constParams.name);
+            }
+            else
+            {
+                layerParams.type = "Scale";
+                layerParams.set("bias_term", true);
+                int axis = 1;
+                for (int i = 0; i < graph_proto.initializer_size(); i++)
+                {
+                    opencv_onnx::TensorProto tensor_proto = graph_proto.initializer(i);
+                    if (tensor_proto.name() == node_proto.input(const_blob_id))
+                    {
+                        axis = inpShape.size() - tensor_proto.dims_size();
+                        break;
+                    }
+                }
+                layerParams.set("axis", axis);
+                blob = blob.reshape(1, 1);
+                layerParams.blobs.push_back((isSub ? -1 : 1) * blob);
+            }
         }
-        else if (layer_type == "LRN")
+    }
+    else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
+    {
+        layerParams.type = "Eltwise";
+        if (isSub)
         {
-            replaceLayerParam(layerParams, "size", "local_size");
+            static float subCoeffs[] = {1.f, -1.f};
+            layerParams.set("coeff", DictValue::arrayReal<float*>(subCoeffs, 2));
         }
-        else if (layer_type == "InstanceNormalization")
+    }
+    else
+    {
+        if (isSub)
         {
-            if (node_proto.input_size() != 3)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected input, scale, bias");
-
-            layerParams.blobs.resize(4);
-            layerParams.blobs[2] = getBlob(node_proto, 1);  // weightData
-            layerParams.blobs[3] = getBlob(node_proto, 2);  // biasData
-            layerParams.set("has_bias", true);
-            layerParams.set("has_weight", true);
-
-            // Get number of channels in input
-            int size = layerParams.blobs[2].total();
-            layerParams.blobs[0] = Mat::zeros(size, 1, CV_32F); // mean
-            layerParams.blobs[1] = Mat::ones(size, 1, CV_32F); // std
-
-            LayerParams mvnParams;
-            mvnParams.name = layerParams.name + "/MVN";
-            mvnParams.type = "MVN";
-            mvnParams.set("eps", layerParams.get<float>("epsilon"));
-            layerParams.erase("epsilon");
-
-            //Create MVN layer
-            int id;
-            if (DNN_DIAGNOSTICS_RUN)
-                id = utilNet.addLayer(mvnParams.name, mvnParams.type, mvnParams);
-            else
-                id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams);
+            LayerParams powerParams;
+            powerParams.name = layerParams.name + "/neg";
+            powerParams.type = "Power";
+            powerParams.set("scale", -1);
+
+            //Create Power layer
+            int id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams);
             //Connect to input
-            IterLayerId_t layerId = layer_id.find(node_proto.input(0));
+            IterLayerId_t layerId = layer_id.find(node_proto.input(1));
             CV_Assert(layerId != layer_id.end());
-            if (DNN_DIAGNOSTICS_RUN)
-                utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-            else
-                dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
+            dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
             //Add shape
-            layer_id.insert(std::make_pair(mvnParams.name, LayerInfo(id, 0)));
-            outShapes[mvnParams.name] = outShapes[node_proto.input(0)];
+            layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0)));
+            outShapes[powerParams.name] = outShapes[node_proto.input(1)];
 
-            //Replace Batch Norm's input to MVN
-            node_proto.set_input(0, mvnParams.name);
-            layerParams.type = "BatchNorm";
+            //Replace input to Power
+            node_proto.set_input(1, powerParams.name);
         }
-        else if (layer_type == "BatchNormalization")
-        {
-            if (node_proto.input_size() != 5)
-                CV_Error(Error::StsNotImplemented,
-                         "Expected input, scale, bias, mean and var");
+        layerParams.type = "Scale";
+        layerParams.set("bias_term", true);
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            layerParams.type = "BatchNorm";
-            replaceLayerParam(layerParams, "epsilon", "eps");
-            replaceLayerParam(layerParams, "spatial", "use_global_stats");
+void ONNXImporter::parsePow(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    if (layer_id.find(node_proto.input(1)) != layer_id.end())
+        CV_Error(Error::StsNotImplemented, "Unsupported Pow op with variable power");
 
-            Mat meanData = getBlob(node_proto, 3);
-            Mat stdData =  getBlob(node_proto, 4);
+    Mat blob = getBlob(node_proto, 1);
+    if (blob.total() != 1)
+        CV_Error(Error::StsNotImplemented, "Pow op supports only scalar power");
 
-            layerParams.blobs.push_back(meanData);
-            layerParams.blobs.push_back(stdData);
+    blob.convertTo(blob, CV_32F);
+    layerParams.type = "Power";
+    layerParams.set("power", blob.ptr<float>()[0]);
+    addLayer(layerParams, node_proto);
+}
 
-            if (!node_proto.input(1).empty()) {
-                layerParams.set("has_weight", true);
-                layerParams.blobs.push_back(getBlob(node_proto, 1));  // weightData
-            } else {
-                layerParams.set("has_weight", false);
-            }
+void ONNXImporter::parseMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Eltwise";
+    layerParams.set("operation", "max");
+    addLayer(layerParams, node_proto);
+}
 
-            if (!node_proto.input(2).empty()) {
-                layerParams.set("has_bias", true);
-                layerParams.blobs.push_back(getBlob(node_proto, 2)); // biasData
-            } else {
-                layerParams.set("has_bias", false);
-            }
-        }
-        else if (layer_type == "Gemm")
-        {
-            CV_Assert(node_proto.input_size() >= 2);
-            layerParams.type = "InnerProduct";
-            Mat weights = getBlob(node_proto, 1);
-            int ind_num_out = 0;
-            if (layerParams.has("transB") && !layerParams.get<int>("transB")) {
-                transpose(weights, weights);
-                ind_num_out = 1;
-            }
-            layerParams.blobs.push_back(weights);
+void ONNXImporter::parseNeg(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Power";
+    layerParams.set("scale", -1);
+    addLayer(layerParams, node_proto);
+}
 
-            if (node_proto.input_size() == 3) {
-                Mat bias = getBlob(node_proto, 2);
-                layerParams.blobs.push_back(bias);
-            }
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                Mat inputBuf = getBlob(node_proto, 0);
+void ONNXImporter::parseConstant(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() == 0);
+    CV_Assert(layerParams.blobs.size() == 1);
+    addConstant(layerParams.name, layerParams.blobs[0]);
+}
 
-                LayerParams constParams;
-                constParams.name = node_proto.input(0);
-                constParams.type = "Const";
-                constParams.blobs.push_back(inputBuf);
+void ONNXImporter::parseLSTM(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    LayerParams lstmParams = layerParams;
+    lstmParams.name += "/lstm";
+
+    // https://pytorch.org/docs/stable/nn.html#lstm
+    CV_Assert(node_proto.input_size() >= 7);
+    Mat Wx = getBlob(node_proto, 1);
+    Mat Wh = getBlob(node_proto, 2);
+    Mat b = getBlob(node_proto, 3);
+
+    const int numHidden = lstmParams.get<int>("hidden_size");
+    const int numDirs = Wx.size[0];  // Is 1 for forward only and 2 for bidirectional LSTM.
+    const int numFeatures = Wx.size[2];
+
+    Mat h0, c0;
+    if (!node_proto.input(5).empty()) {
+        h0 = getBlob(node_proto, 5);
+        h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    } else {
+        // initial_h attribute can be empty in case of keras2onnx producer. fill it with zeros
+        h0 = Mat::zeros(numDirs * numFeatures, numHidden, CV_32FC1);
+    }
+    if (!node_proto.input(6).empty()) {
+        c0 = getBlob(node_proto, 6);
+        c0 = c0.reshape(1, c0.size[0] * c0.size[1]);
+    } else {
+        // initial_c attribute can be empty in case of keras2onnx producer. fill it with zeros
+        c0 = Mat::zeros(numDirs * numFeatures, numHidden, CV_32FC1);
+    }
 
-                opencv_onnx::NodeProto proto;
-                proto.add_output(constParams.name);
-                addLayer(constParams, proto);
-            }
+    b = b.reshape(1, b.size[0]);
+    Mat bx = b.colRange(0, b.cols / 2);
+    Mat bh = b.colRange(b.cols / 2, b.cols);
+    b = bx + bh;
 
-            layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
-            layerParams.set("bias_term", node_proto.input_size() == 3);
-        }
-        else if (layer_type == "MatMul")
-        {
-            CV_Assert(node_proto.input_size() == 2);
-            layerParams.type = "InnerProduct";
-            layerParams.set("bias_term", false);
-            CV_Assert(constBlobs.find(node_proto.input(0)) == constBlobs.end());
-            int firstInpDims = outShapes[node_proto.input(0)].size();
-            int secondInpDims;
-
-            if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
-            {
-                Mat blob = getBlob(node_proto, 1);
-                secondInpDims = blob.dims;
-                layerParams.blobs.push_back(blob.t());
-                layerParams.set("num_output", layerParams.blobs[0].size[0]);
-            } else {
-                secondInpDims = outShapes[node_proto.input(1)].size();
-            }
-            layerParams.set("axis", firstInpDims - secondInpDims + 1);
-        }
-        else if (layer_type == "Mul" || layer_type == "Div")
+    // IFGO->IGFO
+    for (int k = 0; k < numDirs; ++k)
+    {
+        float* WxData = Wx.ptr<float>(k);
+        float* WhData = Wh.ptr<float>(k);
+        float* biasData = b.ptr<float>(k);
+        for (int j = 0; j < numHidden; ++j)
         {
-            CV_Assert(node_proto.input_size() == 2);
-
-            bool isDiv = layer_type == "Div";
-            int constId = -1;
-            bool haveVariables = false;
-            for (int i = 0; i < 2; ++i)
+            for (int i = 0; i < numFeatures; ++i)
             {
-                if (constBlobs.find(node_proto.input(i)) != constBlobs.end())
-                    constId = i;
-                else
-                    haveVariables = true;
+                std::swap(WxData[(numHidden + j) * numFeatures + i],
+                          WxData[(numHidden * 2 + j) * numFeatures + i]);
             }
-            if (constId != -1 && haveVariables)
+            for (int i = 0; i < numHidden; ++i)
             {
-                Mat blob = getBlob(node_proto, constId);
-                blob = blob.reshape(1, 1);
-                if (blob.total() == 1) {
-                    float coeff = isDiv ? 1.0 / blob.at<float>(0) : blob.at<float>(0);
-                    layerParams.set("scale", coeff);
-                    layerParams.type = "Power";
-                }
-                else {
-                    if (isDiv)
-                        divide(1.0, blob, blob);
-                    layerParams.blobs.push_back(blob);
-                    layerParams.type = "Scale";
-                }
+                std::swap(WhData[(numHidden + j) * numHidden + i],
+                          WhData[(numHidden * 2 + j) * numHidden + i]);
             }
-            else if (!haveVariables)
-            {
-                Mat inp0 = getBlob(node_proto, 0);
-                Mat inp1 = getBlob(node_proto, 1);
+            std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
+        }
+    }
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
 
-                if (inp0.size != inp1.size && (inp0.total() != 1 || inp1.total() != 1))
-                    CV_Error_(Error::StsNotImplemented, ("Different shapes case is not supported with constant inputs: %s", layer_type.c_str()));
 
-                if (inp0.total() == 1 && inp1.total() == 1 && inp0.dims != inp1.dims)
-                {
-                    if (inp0.dims < inp1.dims)
-                    {
-                        inp0 = inp0.reshape(1, inp1.dims, inp1.size);
-                        inp0.dims = inp1.dims;
-                    }
-                    else
-                    {
-                        inp1 = inp1.reshape(1, inp0.dims, inp0.size);
-                        inp1.dims = inp0.dims;
-                    }
-                }
+    lstmParams.blobs.resize(5);
+    lstmParams.blobs[0] = Wh;
+    lstmParams.blobs[1] = Wx;
+    lstmParams.blobs[2] = b;
+    lstmParams.blobs[3] = h0;
+    lstmParams.blobs[4] = c0;
 
-                Mat out;
-                if (inp0.total() != inp1.total())
-                {
-                    if (inp0.total() == 1)
-                    {
-                        float coeff = isDiv ? 1.0 / inp0.at<float>(0) : inp0.at<float>(0);
-                        multiply(inp1, coeff, out);
-                    }
-                    else
-                    {
-                        float coeff = isDiv ? 1.0 / inp1.at<float>(0) : inp1.at<float>(0);
-                        multiply(inp0, coeff, out);
-                    }
+    // read direction attribute
+    lstmParams.set("reverse", lstmParams.get<String>("direction", "") == "reverse");
+    lstmParams.set("bidirectional", lstmParams.get<String>("direction", "") == "bidirectional");
 
-                }
-                else
-                {
-                    out = isDiv ? inp0 / inp1 : inp0.mul(inp1);
-                }
+    node_proto.set_output(0, lstmParams.name);  // set different name so output shapes will be registered on that name
+    addLayer(lstmParams, node_proto);
 
-                if (inp0.dims == 1 && inp1.dims == 1)
-                    out.dims = 1;  // to workaround dims == 1
-                addConstant(layerParams.name, out);
-                return;
-            }
-            else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
-            {
-                layerParams.type = "Eltwise";
-                layerParams.set("operation", isDiv ? "div" : "prod");
-            }
-            else
-            {
-                // Scale layer allocate output with the first input shape
-                if (total(outShapes[node_proto.input(0)]) < total(outShapes[node_proto.input(1)]))
-                {
-                    opencv_onnx::NodeProto proto;
-                    proto.add_input(node_proto.input(1));
-                    proto.add_input(node_proto.input(0));
-                    proto.add_output(layerParams.name);
-                    node_proto = proto;
-                }
+    MatShape lstmShape = outShapes[node_proto.output(0)];
 
-                if (isDiv)
-                {
-                    LayerParams powerParams;
-                    powerParams.name = layerParams.name + "/inv";
-                    powerParams.type = "Power";
-                    powerParams.set("power", -1);
-
-                    int id;
-                    //Create Power layer
-                    if (DNN_DIAGNOSTICS_RUN)
-                        id = utilNet.addLayer(powerParams.name, powerParams.type, powerParams);
-                    else
-                        id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams);
-                    //Connect to input
-                    IterLayerId_t layerId = layer_id.find(node_proto.input(1));
-                    CV_Assert(layerId != layer_id.end());
-                    if (DNN_DIAGNOSTICS_RUN)
-                        utilNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-                    else
-                        dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
-                    //Add shape
-                    layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0)));
-                    outShapes[powerParams.name] = outShapes[node_proto.input(1)];
-
-                    //Replace input to Power
-                    node_proto.set_input(1, powerParams.name);
-                }
-                layerParams.type = "Scale";
-            }
-        }
-        else if (layer_type == "Conv")
-        {
-            CV_Assert(node_proto.input_size() >= 2);
-            layerParams.type = "Convolution";
-            for (int j = 1; j < node_proto.input_size(); j++) {
-                if (constBlobs.find(node_proto.input(j)) != constBlobs.end())
-                {
-                    layerParams.blobs.push_back(getBlob(node_proto, j));
-                }
-            }
-            int outCn = layerParams.blobs.empty() ? outShapes[node_proto.input(1)][0] : layerParams.blobs[0].size[0];
-            layerParams.set("num_output", outCn);
-        }
-        else if (layer_type == "ConvTranspose")
-        {
-            CV_Assert(node_proto.input_size() >= 2);
-            layerParams.type = "Deconvolution";
-            for (int j = 1; j < node_proto.input_size(); j++) {
-                layerParams.blobs.push_back(getBlob(node_proto, j));
-            }
-            layerParams.set("num_output", layerParams.blobs[0].size[1] * layerParams.get<int>("group", 1));
-            layerParams.set("bias_term", node_proto.input_size() == 3);
+    // Add fake 1 as it is done in ONNX
+    lstmShape.insert(lstmShape.begin() + 1, 1);
 
-            if (!layerParams.has("kernel_size"))
-                CV_Error(Error::StsNotImplemented,
-                         "Required attribute 'kernel_size' is not present.");
+    layerParams.type = "Reshape";
+    layerParams.set("dim", DictValue::arrayInt(&lstmShape[0], lstmShape.size()));
+    node_proto.set_input(0, lstmParams.name);  // redirect input to LSTM
+    node_proto.set_output(0, layerParams.name);  // keep origin LSTM's name
+    addLayer(layerParams, node_proto);
+}
 
-            if (layerParams.has("output_shape"))
-            {
-                const DictValue& outShape = layerParams.get("output_shape");
-                DictValue strides = layerParams.get("stride");
-                DictValue kernel = layerParams.get("kernel_size");
+void ONNXImporter::parseGRU(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    LayerParams gruParams = layerParams;
+    gruParams.name += "/gru";
+
+    // https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=gru#
+    CV_Assert(node_proto.input_size() == 6);
+    Mat Wx = getBlob(node_proto, 1);
+    Mat Wh = getBlob(node_proto, 2);
+    Mat b = getBlob(node_proto, 3);
+    Mat h0 = getBlob(node_proto, 5);
+
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+    h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    b = b.reshape(1, b.size[0]);
+
+    gruParams.blobs.resize(4);
+    gruParams.blobs[0] = Wh;
+    gruParams.blobs[1] = Wx;
+    gruParams.blobs[2] = b;
+    gruParams.blobs[3] = h0;
+    gruParams.set("bidirectional", gruParams.get<String>("direction", "") == "bidirectional");
+
+    node_proto.set_output(0, gruParams.name);  // set different name so output shapes will be registered on that name
+    addLayer(gruParams, node_proto);
+
+    MatShape gruShape = outShapes[node_proto.output(0)];
+
+    // Add fake 1 as it is done in ONNX
+    gruShape.insert(gruShape.begin() + 1, 1);
+
+    layerParams.type = "Reshape";
+    layerParams.set("dim", DictValue::arrayInt(&gruShape[0], gruShape.size()));
+    node_proto.set_input(0, gruParams.name);  // redirect input to GRU
+    node_proto.set_output(0, layerParams.name);  // keep origin GRU's name
+    addLayer(layerParams, node_proto);
+}
 
-                String padMode;
-                std::vector<int> adjust_pads;
-                if (layerParams.has("pad_mode"))
-                {
-                    padMode = toUpperCase(layerParams.get<String>("pad_mode"));
-                    if (padMode != "SAME" && padMode != "VALID")
-                        CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
+void ONNXImporter::parseImageScaler(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    const float scale = layerParams.has("scale") ? layerParams.get<float>("scale") : 1.0f;
+    layerParams.erase("scale");
 
-                    for (int i = 0; i < strides.size(); i++)
-                    {
-                        int sz = outShape.get<int>(2 + i);
-                        int stride = strides.get<int>(i);
-                        adjust_pads.push_back(padMode == "SAME"? (sz - 1) % stride :
-                                                                 (sz - kernel.get<int>(i)) % stride);
-                    }
-                    layerParams.set("adj", DictValue::arrayInt(&adjust_pads[0], adjust_pads.size()));
-                }
-            }
-            else if (layerParams.has("output_padding"))
-            {
-                replaceLayerParam(layerParams, "output_padding", "adj");
-            }
+    if (layerParams.has("bias"))
+    {
+        layerParams.type = "Scale";
+        layerParams.blobs.push_back(
+                Mat(Size(1,  layerParams.get("bias").size()), CV_32FC1, scale));
+
+        layerParams.set("bias_term", true);
+        Mat bias(1, layerParams.get("bias").size(), CV_32FC1);
+        for (int j = 0; j < bias.total(); j++) {
+            bias.at<float>(0, j) = layerParams.get("bias").getRealValue(j);
         }
-        else if (layer_type == "Transpose")
-        {
-            layerParams.type = "Permute";
-            replaceLayerParam(layerParams, "perm", "order");
+        layerParams.blobs.push_back(bias);
+        layerParams.erase("bias");
+    }
+    else {
+        layerParams.set("scale", scale);
+        layerParams.type = "Power";
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            CV_Assert(node_proto.input_size() == 1);
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                std::vector<Mat> inputs(1, getBlob(node_proto, 0)), transposed;
-                runLayer(layerParams, inputs, transposed);
-                CV_Assert(transposed.size() == 1);
-                addConstant(layerParams.name, transposed[0]);
-                return;
-            }
+void ONNXImporter::parseClip(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "ReLU6";
+    replaceLayerParam(layerParams, "min", "min_value");
+    replaceLayerParam(layerParams, "max", "max_value");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseLeakyRelu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "ReLU";
+    replaceLayerParam(layerParams, "alpha", "negative_slope");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseRelu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "ReLU";
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseElu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "ELU";
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseTanh(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "TanH";
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parsePRelu(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "PReLU";
+    layerParams.blobs.push_back(getBlob(node_proto, 1));
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseLRN(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    replaceLayerParam(layerParams, "size", "local_size");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseInstanceNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    if (node_proto.input_size() != 3)
+        CV_Error(Error::StsNotImplemented,
+                 "Expected input, scale, bias");
+
+    layerParams.blobs.resize(4);
+    layerParams.blobs[2] = getBlob(node_proto, 1);  // weightData
+    layerParams.blobs[3] = getBlob(node_proto, 2);  // biasData
+    layerParams.set("has_bias", true);
+    layerParams.set("has_weight", true);
+
+    // Get number of channels in input
+    int size = layerParams.blobs[2].total();
+    layerParams.blobs[0] = Mat::zeros(size, 1, CV_32F); // mean
+    layerParams.blobs[1] = Mat::ones(size, 1, CV_32F); // std
+
+    LayerParams mvnParams;
+    mvnParams.name = layerParams.name + "/MVN";
+    mvnParams.type = "MVN";
+    mvnParams.set("eps", layerParams.get<float>("epsilon"));
+    layerParams.erase("epsilon");
+
+    //Create MVN layer
+    int id = dstNet.addLayer(mvnParams.name, mvnParams.type, mvnParams);
+    //Connect to input
+    IterLayerId_t layerId = layer_id.find(node_proto.input(0));
+    CV_Assert(layerId != layer_id.end());
+    dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
+    //Add shape
+    layer_id.insert(std::make_pair(mvnParams.name, LayerInfo(id, 0)));
+    outShapes[mvnParams.name] = outShapes[node_proto.input(0)];
+
+    //Replace Batch Norm's input to MVN
+    node_proto.set_input(0, mvnParams.name);
+    layerParams.type = "BatchNorm";
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseBatchNormalization(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    if (node_proto.input_size() != 5)
+        CV_Error(Error::StsNotImplemented,
+                 "Expected input, scale, bias, mean and var");
+
+    layerParams.type = "BatchNorm";
+    replaceLayerParam(layerParams, "epsilon", "eps");
+    replaceLayerParam(layerParams, "spatial", "use_global_stats");
+
+    Mat meanData = getBlob(node_proto, 3);
+    Mat stdData =  getBlob(node_proto, 4);
+
+    layerParams.blobs.push_back(meanData);
+    layerParams.blobs.push_back(stdData);
+
+    if (!node_proto.input(1).empty()) {
+        layerParams.set("has_weight", true);
+        layerParams.blobs.push_back(getBlob(node_proto, 1));  // weightData
+    } else {
+        layerParams.set("has_weight", false);
+    }
+
+    if (!node_proto.input(2).empty()) {
+        layerParams.set("has_bias", true);
+        layerParams.blobs.push_back(getBlob(node_proto, 2)); // biasData
+    } else {
+        layerParams.set("has_bias", false);
+    }
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseGemm(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() >= 2);
+    layerParams.type = "InnerProduct";
+    Mat weights = getBlob(node_proto, 1);
+    int ind_num_out = 0;
+    if (layerParams.has("transB") && !layerParams.get<int>("transB")) {
+        transpose(weights, weights);
+        ind_num_out = 1;
+    }
+    layerParams.blobs.push_back(weights);
+
+    if (node_proto.input_size() == 3) {
+        Mat bias = getBlob(node_proto, 2);
+        layerParams.blobs.push_back(bias);
+    }
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        Mat inputBuf = getBlob(node_proto, 0);
+
+        LayerParams constParams;
+        constParams.name = node_proto.input(0);
+        constParams.type = "Const";
+        constParams.blobs.push_back(inputBuf);
+
+        opencv_onnx::NodeProto proto;
+        proto.add_output(constParams.name);
+        addLayer(constParams, proto);
+    }
+
+    layerParams.set("num_output", layerParams.blobs[0].size[ind_num_out]);
+    layerParams.set("bias_term", node_proto.input_size() == 3);
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseMatMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() == 2);
+    layerParams.type = "InnerProduct";
+    layerParams.set("bias_term", false);
+    CV_Assert(constBlobs.find(node_proto.input(0)) == constBlobs.end());
+    int firstInpDims = outShapes[node_proto.input(0)].size();
+    int secondInpDims;
+
+    if (constBlobs.find(node_proto.input(1)) != constBlobs.end())
+    {
+        Mat blob = getBlob(node_proto, 1);
+        secondInpDims = blob.dims;
+        layerParams.blobs.push_back(blob.t());
+        layerParams.set("num_output", layerParams.blobs[0].size[0]);
+    } else {
+        secondInpDims = outShapes[node_proto.input(1)].size();
+    }
+    layerParams.set("axis", firstInpDims - secondInpDims + 1);
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseMul(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    const std::string& layer_type = node_proto.op_type();
+    CV_Assert(node_proto.input_size() == 2);
+
+    bool isDiv = layer_type == "Div";
+    int constId = -1;
+    bool haveVariables = false;
+    for (int i = 0; i < 2; ++i)
+    {
+        if (constBlobs.find(node_proto.input(i)) != constBlobs.end())
+            constId = i;
+        else
+            haveVariables = true;
+    }
+    if (constId != -1 && haveVariables)
+    {
+        Mat blob = getBlob(node_proto, constId);
+        blob = blob.reshape(1, 1);
+        if (blob.total() == 1) {
+            float blob_value = blob.ptr<float>()[0];
+            float coeff = isDiv ? 1.0 / blob_value : blob_value;
+            layerParams.set("scale", coeff);
+            layerParams.type = "Power";
         }
-        else if (layer_type == "Squeeze")
-        {
-            CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
-            DictValue axes_dict = layerParams.get("axes");
-            MatShape inpShape = outShapes[node_proto.input(0)];
+        else {
+            if (isDiv)
+                divide(1.0, blob, blob);
+            layerParams.blobs.push_back(blob);
+            layerParams.type = "Scale";
+        }
+    }
+    else if (!haveVariables)
+    {
+        Mat inp0 = getBlob(node_proto, 0);
+        Mat inp1 = getBlob(node_proto, 1);
+
+        if (inp0.size != inp1.size && (inp0.total() != 1 || inp1.total() != 1))
+            CV_Error_(Error::StsNotImplemented, ("Different shapes case is not supported with constant inputs: %s", layer_type.c_str()));
 
-            std::vector<bool> maskedAxes(inpShape.size(), false);
-            for (int i = 0; i < axes_dict.size(); ++i)
+        if (inp0.total() == 1 && inp1.total() == 1 && inp0.dims != inp1.dims)
+        {
+            if (inp0.dims < inp1.dims)
             {
-                int axis = axes_dict.getIntValue(i);
-                CV_CheckLE(axis, static_cast<int>(inpShape.size()), "Squeeze axis");
-                maskedAxes[axis] = inpShape[axis] == 1;
+                inp0 = inp0.reshape(1, inp1.dims, inp1.size);
+                inp0.dims = inp1.dims;
             }
-            MatShape outShape;
-            for (int i = 0; i < inpShape.size(); ++i)
+            else
             {
-                if (!maskedAxes[i])
-                    outShape.push_back(inpShape[i]);
+                inp1 = inp1.reshape(1, inp0.dims, inp0.size);
+                inp1.dims = inp0.dims;
             }
-            if (outShape.size() != inpShape.size())
+        }
+
+        Mat out;
+        if (inp0.total() != inp1.total())
+        {
+            if (inp0.total() == 1)
             {
-                layerParams.type = "Reshape";
-                layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
-                if (hasDynamicShapes)
-                {
-                    std::vector<int> dynamicAxes;
-                    std::vector<int> inputIndices;
-                    for (int index = 0; index < inpShape.size(); ++index)
-                    {
-                        if (!maskedAxes[index])
-                            inputIndices.push_back(index);
-                    }
-                    for (int index = 0; index < outShape.size(); ++index)
-                        dynamicAxes.push_back(index);
-                    layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
-                    layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
-                }
+                float inp0_value = inp0.ptr<float>()[0];
+                float coeff = isDiv ? 1.0 / inp0_value : inp0_value;
+                multiply(inp1, coeff, out);
             }
             else
-                layerParams.type = "Identity";
-
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
             {
-                Mat inp = getBlob(node_proto, 0);
-                Mat out = inp.reshape(1, outShape);
-                out.dims = outShape.size();  // to workaround dims == 1
-                addConstant(layerParams.name, out);
-                return;
+                float inp1_value = inp1.ptr<float>()[0];
+                float coeff = isDiv ? 1.0 / inp1_value : inp1_value;
+                multiply(inp0, coeff, out);
             }
+
         }
-        else if (layer_type == "Flatten")
+        else
         {
-            CV_CheckEQ(node_proto.input_size(), 1, "");
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                Mat input = getBlob(node_proto, 0);
-                int axis = normalize_axis(layerParams.get<int>("axis", 1), input.dims);
-
-                std::vector<int> out_size(&input.size[0], &input.size[0] + axis);
-                out_size.push_back(input.total(axis));
-                Mat output = input.reshape(1, out_size);
-                addConstant(layerParams.name, output);
-                return;
-            }
+            out = isDiv ? inp0 / inp1 : inp0.mul(inp1);
         }
-        else if (layer_type == "Unsqueeze")
+
+        if (inp0.dims == 1 && inp1.dims == 1)
+            out.dims = 1;  // to workaround dims == 1
+        addConstant(layerParams.name, out);
+        return;
+    }
+    else if (outShapes[node_proto.input(0)] == outShapes[node_proto.input(1)])
+    {
+        layerParams.type = "Eltwise";
+        layerParams.set("operation", isDiv ? "div" : "prod");
+    }
+    else
+    {
+        // Scale layer allocate output with the first input shape
+        if (total(outShapes[node_proto.input(0)]) < total(outShapes[node_proto.input(1)]))
         {
-            CV_Assert(node_proto.input_size() == 1);
-            DictValue axes = layerParams.get("axes");
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                // Constant input.
-                Mat input = getBlob(node_proto, 0);
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(1));
+            proto.add_input(node_proto.input(0));
+            proto.add_output(layerParams.name);
+            node_proto = proto;
+        }
 
-                std::vector<int> dims;
-                for (int j = 0; j < input.dims; j++) {
-                    dims.push_back(input.size[j]);
-                }
-                CV_Assert(axes.getIntValue(axes.size()-1) <= dims.size());
-                for (int j = 0; j < axes.size(); j++) {
-                    dims.insert(dims.begin() + axes.getIntValue(j), 1);
-                }
+        if (isDiv)
+        {
+            LayerParams powerParams;
+            powerParams.name = layerParams.name + "/inv";
+            powerParams.type = "Power";
+            powerParams.set("power", -1);
 
-                Mat out = input.reshape(0, dims);
-                addConstant(layerParams.name, out);
-                return;
-            }
+            //Create Power layer
+            int id = dstNet.addLayer(powerParams.name, powerParams.type, powerParams);
+            //Connect to input
+            IterLayerId_t layerId = layer_id.find(node_proto.input(1));
+            CV_Assert(layerId != layer_id.end());
+            dstNet.connect(layerId->second.layerId, layerId->second.outputId, id, 0);
+            //Add shape
+            layer_id.insert(std::make_pair(powerParams.name, LayerInfo(id, 0)));
+            outShapes[powerParams.name] = outShapes[node_proto.input(1)];
 
-            // Variable input.
-            if (axes.size() != 1)
-                CV_Error(Error::StsNotImplemented, "Multidimensional unsqueeze");
+            //Replace input to Power
+            node_proto.set_input(1, powerParams.name);
+        }
+        layerParams.type = "Scale";
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            MatShape inpShape = outShapes[node_proto.input(0)];
-            int axis = axes.getIntValue(0);
-            CV_Assert(0 <= axis && axis <= inpShape.size());
-            std::vector<int> outShape = inpShape;
-            outShape.insert(outShape.begin() + axis, 1);
-            layerParams.type = "Reshape";
-            layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
-            if (hasDynamicShapes)
+void ONNXImporter::parseConv(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    CV_Assert(node_proto.input_size() >= 2);
+    layerParams.type = "Convolution";
+    for (int j = 1; j < node_proto.input_size(); j++) {
+        if (constBlobs.find(node_proto.input(j)) != constBlobs.end())
+        {
+            layerParams.blobs.push_back(getBlob(node_proto, j));
+        }
+    }
+    int outCn = layerParams.blobs.empty() ? outShapes[node_proto.input(1)][0] : layerParams.blobs[0].size[0];
+    layerParams.set("num_output", outCn);
+
+    // Check for asymmetric padding in Conv2D
+    if (layerParams.has("pad"))
+    {
+        bool asymmetricPadding = false;
+        DictValue pads = layerParams.get("pad");
+        const int dims = pads.size() / 2;
+        for (int i = 0; i < dims; ++i)
+        {
+            if (pads.get<int>(i) != pads.get<int>(i + dims))
             {
-                std::vector<int> dynamicAxes;
-                std::vector<int> inputIndices;
-                for (int index = 0; index < outShape.size(); ++index) {
-                    if (index != axis)
-                        dynamicAxes.push_back(index);
-                }
-                for (int index = 0; index < inpShape.size(); ++index)
-                    inputIndices.push_back(index);
-                layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
-                layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
+                asymmetricPadding = true;
+                break;
             }
         }
-        else if (layer_type == "Expand")
+        if (asymmetricPadding && pads.size() == 4) // [pad_t, pad_l, pad_b, pad_r]
         {
-            CV_CheckEQ(node_proto.input_size(), 2, "");
-            const std::string& input0 = node_proto.input(0);
-            const std::string& input1 = node_proto.input(1);
-            Mat newShapeMat = getBlob(input1);
-            MatShape targetShape(newShapeMat.ptr<int>(), newShapeMat.ptr<int>() + newShapeMat.total());
-
-            MatShape inpShape;
-            bool haveVariables = constBlobs.find(input0) == constBlobs.end();
-            if (haveVariables)
-            {
-                IterShape_t shapeIt = outShapes.find(input0);
-                CV_Assert(shapeIt != outShapes.end());
-                inpShape = shapeIt->second;
-            }
-            else
-            {
-                inpShape = shape(getBlob(input0));
-            }
+            layerParams.erase("pad");
+            // No paddings required for N, C axis
+            std::vector<int> paddings(4, 0);
+            // Add paddings for H, W axis
+            for (int i = 0; i < dims; ++i)
+            {
+                paddings.push_back(pads.get<int>(i));
+                paddings.push_back(pads.get<int>(dims + i));
+            }
+            LayerParams padLp;
+            padLp.name = layerParams.name + "/pad";
+            padLp.type = "Padding";
+            padLp.set("paddings", DictValue::arrayInt(&paddings[0], paddings.size()));
+
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(0));
+            proto.add_output(padLp.name);
+
+            addLayer(padLp, proto);
+            node_proto.set_input(0, padLp.name);
+        }
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            String srcName = input0;
-            // Unsqueeze and repeat along new axis
-            if (targetShape.size() == inpShape.size() + 1)
-            {
-                for (int i = 0; i < targetShape.size(); i++)
-                {
-                    if (targetShape[i] == -1 && i < inpShape.size())
-                        targetShape[i] = inpShape[i];
-                    else if (i < inpShape.size() && targetShape[i] != inpShape[i])
-                        inpShape.insert(inpShape.begin() + i, 1);
-                }
-                if (haveVariables)
-                {
-                    LayerParams reshapeLp;
-                    reshapeLp.name = layerParams.name + "/reshape";
-                    reshapeLp.type = "Reshape";
-                    CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
-                    reshapeLp.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
-
-                    opencv_onnx::NodeProto proto;
-                    proto.add_input(node_proto.input(0));
-                    proto.add_output(reshapeLp.name);
-                    addLayer(reshapeLp, proto);
-                    srcName = reshapeLp.name;
-                }
-            }
-            CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
+void ONNXImporter::parseConvTranspose(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() >= 2);
+    layerParams.type = "Deconvolution";
+    for (int j = 1; j < node_proto.input_size(); j++) {
+        layerParams.blobs.push_back(getBlob(node_proto, j));
+    }
+    layerParams.set("num_output", layerParams.blobs[0].size[1] * layerParams.get<int>("group", 1));
+    layerParams.set("bias_term", node_proto.input_size() == 3);
 
-            std::vector<int> broadcast_axes;
-            for (int i = 0; i < targetShape.size(); i++)
-            {
-                if (targetShape[i] != inpShape[i])
-                {
-                    if (inpShape[i] == 1)
-                        broadcast_axes.push_back(i);
-                    else
-                        CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
-                }
-            }
+    if (!layerParams.has("kernel_size"))
+        CV_Error(Error::StsNotImplemented,
+                 "Required attribute 'kernel_size' is not present.");
 
-            if (!haveVariables)
+    if (layerParams.has("output_shape"))
+    {
+        const DictValue& outShape = layerParams.get("output_shape");
+        DictValue strides = layerParams.get("stride");
+        DictValue kernel = layerParams.get("kernel_size");
+
+        String padMode;
+        std::vector<int> adjust_pads;
+        if (layerParams.has("pad_mode"))
+        {
+            padMode = toUpperCase(layerParams.get<String>("pad_mode"));
+            if (padMode != "SAME" && padMode != "VALID")
+                CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
+
+            for (int i = 0; i < strides.size(); i++)
             {
-                if (broadcast_axes.size() != 1)
-                    CV_Error(Error::StsNotImplemented, "Expand op doesn't support multiple axes for constant input");
-
-                Mat input = getBlob(node_proto, 0);
-                input = input.reshape(0, total(inpShape, 0, broadcast_axes[0]));
-                Mat output = cv::repeat(input, 1, targetShape[broadcast_axes[0]]);
-                output = output.reshape(0, targetShape);
-                addConstant(layerParams.name, output);
-                return;
+                int sz = outShape.get<int>(2 + i);
+                int stride = strides.get<int>(i);
+                adjust_pads.push_back(padMode == "SAME"? (sz - 1) % stride :
+                                                         (sz - kernel.get<int>(i)) % stride);
             }
+            layerParams.set("adj", DictValue::arrayInt(&adjust_pads[0], adjust_pads.size()));
+        }
+    }
+    else if (layerParams.has("output_padding"))
+    {
+        replaceLayerParam(layerParams, "output_padding", "adj");
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            if (broadcast_axes.size() == 2 &&
-                broadcast_axes[0] == broadcast_axes[1] - 1 && broadcast_axes[1] == inpShape.size() - 1)
-            {
-                LayerParams constParams;
-                constParams.name = layerParams.name + "/const";
-                CV_Assert(layer_id.find(constParams.name) == layer_id.end());
-                constParams.type = "Const";
+void ONNXImporter::parseTranspose(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Permute";
+    replaceLayerParam(layerParams, "perm", "order");
 
-                Mat inp = Mat::ones(newShapeMat.total(), newShapeMat.ptr<int>(), CV_32F);
-                constParams.blobs.push_back(inp);
+    CV_Assert(node_proto.input_size() == 1);
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        std::vector<Mat> inputs(1, getBlob(node_proto, 0)), transposed;
+        runLayer(layerParams, inputs, transposed);
+        CV_Assert(transposed.size() == 1);
+        addConstant(layerParams.name, transposed[0]);
+        return;
+    }
+    addLayer(layerParams, node_proto);
+}
 
-                opencv_onnx::NodeProto proto;
-                proto.add_output(constParams.name);
-                addLayer(constParams, proto);
+void ONNXImporter::parseSqueeze(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert_N(node_proto.input_size() == 1, layerParams.has("axes"));
+    DictValue axes_dict = layerParams.get("axes");
+    MatShape inpShape = outShapes[node_proto.input(0)];
 
-                layerParams.type = "Scale";
-                layerParams.set("bias_term", false);
-                node_proto.set_input(0, constParams.name);
-                node_proto.set_input(1, srcName);
-            }
-            else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
+    std::vector<bool> maskedAxes(inpShape.size(), false);
+    for (int i = 0; i < axes_dict.size(); ++i)
+    {
+        int axis = axes_dict.getIntValue(i);
+        CV_CheckLE(axis, static_cast<int>(inpShape.size()), "Squeeze axis");
+        maskedAxes[axis] = inpShape[axis] == 1;
+    }
+    MatShape outShape;
+    for (int i = 0; i < inpShape.size(); ++i)
+    {
+        if (!maskedAxes[i])
+            outShape.push_back(inpShape[i]);
+    }
+    if (outShape.size() != inpShape.size())
+    {
+        layerParams.type = "Reshape";
+        layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
+        if (hasDynamicShapes)
+        {
+            std::vector<int> dynamicAxes;
+            std::vector<int> inputIndices;
+            for (int index = 0; index < inpShape.size(); ++index)
             {
-                String base_name = layerParams.name + "/copy_";
-                std::vector<std::string> input_names;
-                for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
-                {
-                    std::ostringstream ss;
-                    ss << j;
-                    LayerParams copyLP;
-                    copyLP.name = base_name + ss.str();
-                    copyLP.type = "Identity";
-                    CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
-                    input_names.push_back(copyLP.name);
-
-                    node_proto.set_input(0, srcName);
-                    node_proto.set_output(0, copyLP.name);
-                    addLayer(copyLP, node_proto);
-                }
-                node_proto.clear_input();
-                for (int i = 0; i < input_names.size(); i++)
-                {
-                    node_proto.add_input(input_names[i]);
-                }
-                layerParams.set("axis", broadcast_axes[0]);
-                layerParams.type = "Concat";
-                node_proto.set_output(0, layerParams.name);
+                if (!maskedAxes[index])
+                    inputIndices.push_back(index);
             }
-            else
-                CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
+            for (int index = 0; index < outShape.size(); ++index)
+                dynamicAxes.push_back(index);
+            layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
+            layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
         }
-        else if (layer_type == "Reshape")
-        {
-            CV_Assert(node_proto.input_size() == 2 || layerParams.has("shape"));
+    }
+    else
+        layerParams.type = "Identity";
 
-            if (node_proto.input_size() == 2) {
-                Mat blob = getBlob(node_proto, 1);
-                CV_Assert(blob.type() == CV_32SC1);
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        Mat inp = getBlob(node_proto, 0);
+        Mat out = inp.reshape(1, outShape);
+        out.dims = outShape.size();  // to workaround dims == 1
+        addConstant(layerParams.name, out);
+        return;
+    }
+    addLayer(layerParams, node_proto);
+}
 
-                layerParams.set("dim", DictValue::arrayInt<int*>(
-                            blob.ptr<int>(), blob.total() ));
+void ONNXImporter::parseFlatten(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_CheckEQ(node_proto.input_size(), 1, "");
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        Mat input = getBlob(node_proto, 0);
+        int axis = normalize_axis(layerParams.get<int>("axis", 1), input.dims);
+
+        std::vector<int> out_size(&input.size[0], &input.size[0] + axis);
+        out_size.push_back(input.total(axis));
+        Mat output = input.reshape(1, out_size);
+        addConstant(layerParams.name, output);
+        return;
+    }
+    addLayer(layerParams, node_proto);
+}
 
-                if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
-                    std::vector<Mat> inputs(1, getBlob(node_proto, 0)), outputs;
-                    runLayer(layerParams, inputs, outputs);
-                    addConstant(layerParams.name, outputs[0]);
-                    return;
-                }
-            }
-            else {
-                DictValue shape = layerParams.get("shape");
-                std::vector<int> dim;
-                for (int j = 0; j < shape.size(); j++) {
-                    dim.push_back(shape.getIntValue(j));
-                }
+void ONNXImporter::parseUnsqueeze(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() == 1);
+    DictValue axes = layerParams.get("axes");
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        // Constant input.
+        Mat input = getBlob(node_proto, 0);
 
-                if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
-                    Mat input = getBlob(node_proto, 0);
-                    Mat out = input.reshape(0, dim);
-                    addConstant(layerParams.name, out);
-                    return;
-                }
-                replaceLayerParam(layerParams, "shape", "dim");
-            }
+        std::vector<int> dims;
+        for (int j = 0; j < input.dims; j++) {
+            dims.push_back(input.size[j]);
+        }
+        CV_Assert(axes.getIntValue(axes.size()-1) <= dims.size());
+        for (int j = 0; j < axes.size(); j++) {
+            dims.insert(dims.begin() + axes.getIntValue(j), 1);
         }
-        else if (layer_type == "Pad")
-        {
-            layerParams.type = "Padding";
-            replaceLayerParam(layerParams, "mode", "type");
-            if (node_proto.input_size() == 3 || node_proto.input_size() == 2)
-            {
-                // Paddings are in order begin0, begin1, .. beginN, end0, end1, ..., endN.
-                // We need to shuffle it to begin0, end0, begin1, end1, ...
-                Mat paddings = getBlob(node_proto, 1).reshape(1, 2);
-                paddings = paddings.t();
-                layerParams.set("paddings", DictValue::arrayInt(paddings.ptr<int>(), paddings.total()));
 
-                if (node_proto.input_size() == 3)
-                {
-                    Mat value = getBlob(node_proto, 2);
-                    layerParams.set("value", value.at<float>(0));
-                }
-            }
+        Mat out = input.reshape(0, dims);
+        addConstant(layerParams.name, out);
+        return;
+    }
+
+    // Variable input.
+    if (axes.size() != 1)
+        CV_Error(Error::StsNotImplemented, "Multidimensional unsqueeze");
+
+    MatShape inpShape = outShapes[node_proto.input(0)];
+    int axis = axes.getIntValue(0);
+    CV_Assert(0 <= axis && axis <= inpShape.size());
+    std::vector<int> outShape = inpShape;
+    outShape.insert(outShape.begin() + axis, 1);
+    layerParams.type = "Reshape";
+    layerParams.set("dim", DictValue::arrayInt(&outShape[0], outShape.size()));
+    if (hasDynamicShapes)
+    {
+        std::vector<int> dynamicAxes;
+        std::vector<int> inputIndices;
+        for (int index = 0; index < outShape.size(); ++index) {
+            if (index != axis)
+                dynamicAxes.push_back(index);
         }
-        else if (layer_type == "Shape")
-        {
-            CV_Assert(node_proto.input_size() == 1);
-            IterShape_t shapeIt = outShapes.find(node_proto.input(0));
-            CV_Assert(shapeIt != outShapes.end());
-            const MatShape& inpShape = shapeIt->second;
+        for (int index = 0; index < inpShape.size(); ++index)
+            inputIndices.push_back(index);
+        layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
+        layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
+    }
+    addLayer(layerParams, node_proto);
+}
 
-            Mat shapeMat(inpShape.size(), 1, CV_32S);
-            for (int j = 0; j < inpShape.size(); ++j)
-                shapeMat.at<int>(j) = inpShape[j];
-            shapeMat.dims = 1;
+void ONNXImporter::parseExpand(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    CV_CheckEQ(node_proto.input_size(), 2, "");
+    const std::string& input0 = node_proto.input(0);
+    const std::string& input1 = node_proto.input(1);
+    Mat newShapeMat = getBlob(input1);
+    MatShape targetShape(newShapeMat.ptr<int>(), newShapeMat.ptr<int>() + newShapeMat.total());
+
+    MatShape inpShape;
+    bool haveVariables = constBlobs.find(input0) == constBlobs.end();
+    if (haveVariables)
+    {
+        IterShape_t shapeIt = outShapes.find(input0);
+        CV_Assert(shapeIt != outShapes.end());
+        inpShape = shapeIt->second;
+    }
+    else
+    {
+        inpShape = shape(getBlob(input0));
+    }
 
-            addConstant(layerParams.name, shapeMat);
-            return;
+    String srcName = input0;
+    // Unsqueeze and repeat along new axis
+    if (targetShape.size() == inpShape.size() + 1)
+    {
+        for (int i = 0; i < targetShape.size(); i++)
+        {
+            if (targetShape[i] == -1 && i < inpShape.size())
+                targetShape[i] = inpShape[i];
+            else if (i < inpShape.size() && targetShape[i] != inpShape[i])
+                inpShape.insert(inpShape.begin() + i, 1);
         }
-        else if (layer_type == "Cast")
+        if (haveVariables)
         {
-            if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
-            {
-                Mat blob = getBlob(node_proto, 0);
-                int type;
-                switch (layerParams.get<int>("to"))
-                {
-                    case opencv_onnx::TensorProto_DataType_FLOAT:   type = CV_32F; break;
-                    case opencv_onnx::TensorProto_DataType_UINT8:   type = CV_8U; break;
-                    case opencv_onnx::TensorProto_DataType_UINT16:  type = CV_16U; break;
-                    case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16S; break;
-                    case opencv_onnx::TensorProto_DataType_INT8:
-                    case opencv_onnx::TensorProto_DataType_INT16:
-                    case opencv_onnx::TensorProto_DataType_INT32:
-                    case opencv_onnx::TensorProto_DataType_INT64:   type = CV_32S; break;
-                    default: type = blob.type();
-                }
-                Mat dst;
-                blob.convertTo(dst, type);
-                dst.dims = blob.dims;
-                addConstant(layerParams.name, dst);
-                return;
-            }
-            else
-                layerParams.type = "Identity";
+            LayerParams reshapeLp;
+            reshapeLp.name = layerParams.name + "/reshape";
+            reshapeLp.type = "Reshape";
+            CV_Assert(layer_id.find(reshapeLp.name) == layer_id.end());
+            reshapeLp.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
+
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(0));
+            proto.add_output(reshapeLp.name);
+            addLayer(reshapeLp, proto);
+            srcName = reshapeLp.name;
         }
-        else if (layer_type == "ConstantOfShape" || layer_type == "ConstantFill")
+    }
+    CV_CheckEQ(inpShape.size(), targetShape.size(), "Unsupported Expand op with different dims");
+
+    std::vector<int> broadcast_axes;
+    for (int i = 0; i < targetShape.size(); i++)
+    {
+        if (targetShape[i] != inpShape[i])
         {
-            int depth = CV_32F;
-            float fill_value;
-            if (!layerParams.blobs.empty())
-            {
-                CV_Assert(!layerParams.has("value"));
-                depth = layerParams.blobs[0].depth();
-                Mat floats;
-                layerParams.blobs[0].convertTo(floats, CV_32F);
-                fill_value = floats.at<float>(0, 0);
-            }
+            if (inpShape[i] == 1)
+                broadcast_axes.push_back(i);
             else
-                fill_value = layerParams.get("value", 0);
+                CV_Error(Error::StsError, format("Could not be broadcast by axis: %d", i));
+        }
+    }
 
-            MatShape inpShape = getBlob(node_proto, 0);
-            for (int i = 0; i < inpShape.size(); i++)
-                CV_CheckGT(inpShape[i], 0, "");
-            Mat tensor(inpShape.size(), &inpShape[0], depth, Scalar(fill_value));
-            addConstant(layerParams.name, tensor);
-            return;
+    if (!haveVariables)
+    {
+        if (broadcast_axes.size() != 1)
+            CV_Error(Error::StsNotImplemented, "Expand op doesn't support multiple axes for constant input");
+
+        Mat input = getBlob(node_proto, 0);
+        input = input.reshape(0, total(inpShape, 0, broadcast_axes[0]));
+        Mat output = cv::repeat(input, 1, targetShape[broadcast_axes[0]]);
+        output = output.reshape(0, targetShape);
+        addConstant(layerParams.name, output);
+        return;
+    }
+
+    if (broadcast_axes.size() == 2 &&
+        broadcast_axes[0] == broadcast_axes[1] - 1 && broadcast_axes[1] == inpShape.size() - 1)
+    {
+        LayerParams constParams;
+        constParams.name = layerParams.name + "/const";
+        CV_Assert(layer_id.find(constParams.name) == layer_id.end());
+        constParams.type = "Const";
+
+        Mat inp = Mat::ones(newShapeMat.total(), newShapeMat.ptr<int>(), CV_32F);
+        constParams.blobs.push_back(inp);
+
+        opencv_onnx::NodeProto proto;
+        proto.add_output(constParams.name);
+        addLayer(constParams, proto);
+
+        layerParams.type = "Scale";
+        layerParams.set("bias_term", false);
+        node_proto.set_input(0, constParams.name);
+        node_proto.set_input(1, srcName);
+    }
+    else if (broadcast_axes.size() == 1 && broadcast_axes[0] <= 1)
+    {
+        String base_name = layerParams.name + "/copy_";
+        std::vector<std::string> input_names;
+        for (int j = 0; j < targetShape[broadcast_axes[0]]; j++)
+        {
+            std::ostringstream ss;
+            ss << j;
+            LayerParams copyLP;
+            copyLP.name = base_name + ss.str();
+            copyLP.type = "Identity";
+            CV_Assert(layer_id.find(copyLP.name) == layer_id.end());
+            input_names.push_back(copyLP.name);
+
+            node_proto.set_input(0, srcName);
+            node_proto.set_output(0, copyLP.name);
+            addLayer(copyLP, node_proto);
         }
-        else if (layer_type == "Gather")
+        node_proto.clear_input();
+        for (int i = 0; i < input_names.size(); i++)
         {
-            CV_Assert(node_proto.input_size() == 2);
-            Mat indexMat = getBlob(node_proto, 1);
-            CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1);
-            int index = indexMat.at<int>(0);
-            int axis = layerParams.get<int>("axis", 0);
+            node_proto.add_input(input_names[i]);
+        }
+        layerParams.set("axis", broadcast_axes[0]);
+        layerParams.type = "Concat";
+        node_proto.set_output(0, layerParams.name);
+    }
+    else
+        CV_Error(Error::StsNotImplemented, "Unsupported Expand op");
+    addLayer(layerParams, node_proto);
+}
 
-            if ((constBlobs.find(node_proto.input(0)) != constBlobs.end()))
-            {
-                Mat input = getBlob(node_proto, 0);
-                Mat out;
-                std::vector<cv::Range> ranges(input.dims, Range::all());
-                ranges[axis] = Range(index, index + 1);
-
-                out = input(ranges);
-                MatShape outShape = shape(out);
-                if (outShape.size() > 1)
-                {
-                    outShape.erase(outShape.begin() + axis);
-                    out.reshape(0, outShape);
-                } else {
-                    out.dims = 1;
-                }
-                addConstant(layerParams.name, out);
-                return;
-            }
-            else
-            {
-                IterShape_t shapeIt = outShapes.find(node_proto.input(0));
-                CV_Assert(shapeIt != outShapes.end());
-                MatShape inpShape = shapeIt->second;
-
-                LayerParams sliceLp;
-                sliceLp.type = "Slice";
-                sliceLp.name = inpShape.size() > 1 ? layerParams.name + "/slice" : layerParams.name;
-                std::vector<int> begin(inpShape.size(), 0);
-                std::vector<int> end(inpShape.size(), -1);
-                begin[axis] = index;
-                end[axis] = index + 1;
-
-                cv::dnn::DictValue paramBegin = cv::dnn::DictValue::arrayInt(begin.data(), begin.size());
-                cv::dnn::DictValue paramEnd = cv::dnn::DictValue::arrayInt(end.data(), end.size());
-                sliceLp.set("begin", paramBegin);
-                sliceLp.set("end", paramEnd);
-                sliceLp.set("has_dynamic_shapes", hasDynamicShapes);
-
-                if (inpShape.size() > 1)
-                {
-                    opencv_onnx::NodeProto proto;
-                    proto.add_input(node_proto.input(0));
-                    proto.add_output(sliceLp.name);
-                    addLayer(sliceLp, proto);
-
-                    inpShape.erase(inpShape.begin() + axis);
-                    layerParams.type = "Reshape";
-                    layerParams.set("axis", 0);
-                    layerParams.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
-                    if (hasDynamicShapes)
-                    {
-                        std::vector<int> dynamicAxes;
-                        std::vector<int> inputIndices;
-                        for (int index = 0; index < inpShape.size(); ++index)
-                            dynamicAxes.push_back(index);
-                        for (int index = 0; index < inpShape.size(); ++index)
-                            inputIndices.push_back(index);
-                        layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
-                        layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
-                    }
-                    node_proto.set_input(0, sliceLp.name);
-                }
-                else
-                {
-                    layerParams = sliceLp;
-                }
-            }
+void ONNXImporter::parseReshape(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() == 2 || layerParams.has("shape"));
+
+    if (node_proto.input_size() == 2) {
+        Mat blob = getBlob(node_proto, 1);
+        CV_Assert(blob.type() == CV_32SC1);
+
+        layerParams.set("dim", DictValue::arrayInt<int*>(blob.ptr<int>(), blob.total()));
+
+        if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
+            std::vector<Mat> inputs(1, getBlob(node_proto, 0)), outputs;
+            runLayer(layerParams, inputs, outputs);
+            addConstant(layerParams.name, outputs[0]);
+            return;
+        }
+    }
+    else {
+        DictValue shape = layerParams.get("shape");
+        std::vector<int> dim;
+        for (int j = 0; j < shape.size(); j++) {
+            dim.push_back(shape.getIntValue(j));
         }
-        else if (layer_type == "Concat")
-        {
-            bool hasVariableInps = false;
-            for (int i = 0; i < node_proto.input_size(); ++i)
-            {
-                if (layer_id.find(node_proto.input(i)) != layer_id.end())
-                {
-                    hasVariableInps = true;
-                    break;
-                }
-            }
 
-            if (!hasVariableInps)
-            {
-                std::vector<Mat> inputs(node_proto.input_size()), concatenated;
-                // Due constant folding we can get inputs with different number of dimensions
-                // Insert the missing dimension to inputs
-                MatShape inputShape;
-                for (size_t i = 0; i < inputs.size(); ++i)
-                {
-                    inputs[i] = getBlob(node_proto, i);
-                    if (inputs[i].size.dims() > inputShape.size())
-                    {
-                        inputShape = shape(inputs[i]);
-                    }
-                }
+        if (layer_id.find(node_proto.input(0)) == layer_id.end()) {
+            Mat input = getBlob(node_proto, 0);
+            Mat out = input.reshape(0, dim);
+            addConstant(layerParams.name, out);
+            return;
+        }
+        replaceLayerParam(layerParams, "shape", "dim");
+    }
+    addLayer(layerParams, node_proto);
+}
 
-                // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1
-                int axis = layerParams.get<int>("axis", 1);
-                for (size_t i = 0; i < inputs.size(); ++i)
-                {
-                    MatShape targetShape = inputShape;
-                    targetShape[axis] = shape(inputs[i])[axis];
-                    CV_CheckEQ(total(targetShape), total(shape(inputs[i])), "");
-                    inputs[i] = inputs[i].reshape(0, targetShape);
-                }
-                runLayer(layerParams, inputs, concatenated);
+void ONNXImporter::parsePad(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "Padding";
+    replaceLayerParam(layerParams, "mode", "type");
+    if (node_proto.input_size() == 3 || node_proto.input_size() == 2)
+    {
+        // Paddings are in order begin0, begin1, .. beginN, end0, end1, ..., endN.
+        // We need to shuffle it to begin0, end0, begin1, end1, ...
+        Mat paddings = getBlob(node_proto, 1).reshape(1, 2);
+        paddings = paddings.t();
+        layerParams.set("paddings", DictValue::arrayInt(paddings.ptr<int>(), paddings.total()));
 
-                CV_Assert(concatenated.size() == 1);
-                addConstant(layerParams.name, concatenated[0]);
-                return;
-            }
+        if (node_proto.input_size() == 3)
+        {
+            Mat value = getBlob(node_proto, 2);
+            layerParams.set("value", value.ptr<float>()[0]);
         }
-        else if (layer_type == "Resize")
+    }
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseShape(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    CV_Assert(node_proto.input_size() == 1);
+    IterShape_t shapeIt = outShapes.find(node_proto.input(0));
+    CV_Assert(shapeIt != outShapes.end());
+    const MatShape& inpShape = shapeIt->second;
+
+    Mat shapeMat(inpShape.size(), 1, CV_32S);
+    for (int j = 0; j < inpShape.size(); ++j)
+        shapeMat.at<int>(j) = inpShape[j];
+    shapeMat.dims = 1;
+
+    addConstant(layerParams.name, shapeMat);
+}
+
+void ONNXImporter::parseCast(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    if (constBlobs.find(node_proto.input(0)) != constBlobs.end())
+    {
+        Mat blob = getBlob(node_proto, 0);
+        int type;
+        switch (layerParams.get<int>("to"))
         {
-            for (int i = 1; i < node_proto.input_size(); i++)
-                CV_Assert(layer_id.find(node_proto.input(i)) == layer_id.end());
+            case opencv_onnx::TensorProto_DataType_FLOAT:   type = CV_32F; break;
+            case opencv_onnx::TensorProto_DataType_UINT8:   type = CV_8U; break;
+            case opencv_onnx::TensorProto_DataType_UINT16:  type = CV_16U; break;
+            case opencv_onnx::TensorProto_DataType_FLOAT16: type = CV_16S; break;
+            case opencv_onnx::TensorProto_DataType_INT8:
+            case opencv_onnx::TensorProto_DataType_INT16:
+            case opencv_onnx::TensorProto_DataType_INT32:
+            case opencv_onnx::TensorProto_DataType_INT64:   type = CV_32S; break;
+            default: type = blob.type();
+        }
+        Mat dst;
+        blob.convertTo(dst, type);
+        dst.dims = blob.dims;
+        addConstant(layerParams.name, dst);
+        return;
+    }
+    else
+        layerParams.type = "Identity";
+    addLayer(layerParams, node_proto);
+}
 
-            if (layerParams.has("coordinate_transformation_mode"))
-            {
-                String interp_mode = layerParams.get<String>("coordinate_transformation_mode");
-                CV_Assert_N(interp_mode != "tf_crop_and_resize", interp_mode != "tf_half_pixel_for_nn");
+void ONNXImporter::parseConstantFill(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    int depth = CV_32F;
+    float fill_value;
+    if (!layerParams.blobs.empty())
+    {
+        CV_Assert(!layerParams.has("value"));
+        depth = layerParams.blobs[0].depth();
+        Mat floats;
+        layerParams.blobs[0].convertTo(floats, CV_32F);
+        fill_value = floats.at<float>(0, 0);
+    }
+    else
+        fill_value = layerParams.get("value", 0);
 
-                layerParams.set("align_corners", interp_mode == "align_corners");
-                if (layerParams.get<String>("mode") == "linear")
-                {
-                    layerParams.set("mode", interp_mode == "pytorch_half_pixel" ?
-                                            "opencv_linear" : "bilinear");
-                }
-            }
-            if (layerParams.get<String>("mode") == "linear" && framework_name == "pytorch")
-                layerParams.set("mode", "opencv_linear");
+    MatShape inpShape = getBlob(node_proto, 0);
+    for (int i = 0; i < inpShape.size(); i++)
+        CV_CheckGT(inpShape[i], 0, "");
+    Mat tensor(inpShape.size(), &inpShape[0], depth, Scalar(fill_value));
+    addConstant(layerParams.name, tensor);
+}
 
-            // input = [X, scales], [X, roi, scales] or [x, roi, scales, sizes]
-            int foundScaleId = hasDynamicShapes ? node_proto.input_size() - 1
-                                                : node_proto.input_size() > 2 ? 2 : 1;
+void ONNXImporter::parseGather(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    CV_Assert(node_proto.input_size() == 2);
+    Mat indexMat = getBlob(node_proto, 1);
+    CV_Assert_N(indexMat.type() == CV_32S, indexMat.total() == 1);
+    int index = indexMat.at<int>(0);
+    int axis = layerParams.get<int>("axis", 0);
+
+    if ((constBlobs.find(node_proto.input(0)) != constBlobs.end()))
+    {
+        Mat input = getBlob(node_proto, 0);
+        Mat out;
+        std::vector<cv::Range> ranges(input.dims, Range::all());
+        ranges[axis] = Range(index, index + 1);
+
+        out = input(ranges);
+        MatShape outShape = shape(out);
+        if (outShape.size() > 1)
+        {
+            outShape.erase(outShape.begin() + axis);
+            out.reshape(0, outShape);
+        } else {
+            out.dims = 1;
+        }
+        addConstant(layerParams.name, out);
+        return;
+    }
+    else
+    {
+        IterShape_t shapeIt = outShapes.find(node_proto.input(0));
+        CV_Assert(shapeIt != outShapes.end());
+        MatShape inpShape = shapeIt->second;
+
+        LayerParams sliceLp;
+        sliceLp.type = "Slice";
+        sliceLp.name = inpShape.size() > 1 ? layerParams.name + "/slice" : layerParams.name;
+        std::vector<int> begin(inpShape.size(), 0);
+        std::vector<int> end(inpShape.size(), -1);
+        begin[axis] = index;
+        end[axis] = index + 1;
+
+        cv::dnn::DictValue paramBegin = cv::dnn::DictValue::arrayInt(begin.data(), begin.size());
+        cv::dnn::DictValue paramEnd = cv::dnn::DictValue::arrayInt(end.data(), end.size());
+        sliceLp.set("begin", paramBegin);
+        sliceLp.set("end", paramEnd);
+        sliceLp.set("has_dynamic_shapes", hasDynamicShapes);
+
+        if (inpShape.size() > 1)
+        {
+            opencv_onnx::NodeProto proto;
+            proto.add_input(node_proto.input(0));
+            proto.add_output(sliceLp.name);
+            addLayer(sliceLp, proto);
 
-            Mat scales = getBlob(node_proto, foundScaleId);
-            if (scales.total() == 4)
-            {
-                layerParams.set("zoom_factor_y", scales.at<float>(2));
-                layerParams.set("zoom_factor_x", scales.at<float>(3));
-            }
-            else
+            inpShape.erase(inpShape.begin() + axis);
+            layerParams.type = "Reshape";
+            layerParams.set("axis", 0);
+            layerParams.set("dim", DictValue::arrayInt(&inpShape[0], inpShape.size()));
+            if (hasDynamicShapes)
             {
-                const std::string& inputLast = node_proto.input(node_proto.input_size() - 1);
-                if (constBlobs.find(inputLast) != constBlobs.end())
-                {
-                    Mat shapes = getBlob(inputLast);
-                    CV_CheckEQ(shapes.size[0], 4, "");
-                    CV_CheckEQ(shapes.size[1], 1, "");
-                    CV_CheckDepth(shapes.depth(), shapes.depth() == CV_32S || shapes.depth() == CV_32F, "");
-                    if (shapes.depth() == CV_32F)
-                        shapes.convertTo(shapes, CV_32S);
-                    layerParams.set("width", shapes.at<int>(3));
-                    layerParams.set("height", shapes.at<int>(2));
-                }
+                std::vector<int> dynamicAxes;
+                std::vector<int> inputIndices;
+                for (int index = 0; index < inpShape.size(); ++index)
+                    dynamicAxes.push_back(index);
+                for (int index = 0; index < inpShape.size(); ++index)
+                    inputIndices.push_back(index);
+                layerParams.set("dynamic_axes", DictValue::arrayInt(dynamicAxes.data(), dynamicAxes.size()));
+                layerParams.set("input_indices", DictValue::arrayInt(inputIndices.data(), inputIndices.size()));
             }
-            replaceLayerParam(layerParams, "mode", "interpolation");
+            node_proto.set_input(0, sliceLp.name);
         }
-        else if (layer_type == "Upsample")
+        else
         {
-            //fused from Resize Subgraph
-            if (layerParams.has("coordinate_transformation_mode"))
-            {
-                String interp_mode = layerParams.get<String>("coordinate_transformation_mode");
-                CV_Assert_N(interp_mode != "tf_crop_and_resize", interp_mode != "tf_half_pixel_for_nn");
+            layerParams = sliceLp;
+        }
+    }
+    addLayer(layerParams, node_proto);
+}
 
-                layerParams.set("align_corners", interp_mode == "align_corners");
-                if (layerParams.get<String>("mode") == "linear")
-                {
-                    layerParams.set("mode", interp_mode == "pytorch_half_pixel" ?
-                                            "opencv_linear" : "bilinear");
-                }
-            }
-            if (layerParams.get<String>("mode") == "linear" && framework_name == "pytorch")
-                layerParams.set("mode", "opencv_linear");
+void ONNXImporter::parseConcat(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    bool hasVariableInps = false;
+    for (int i = 0; i < node_proto.input_size(); ++i)
+    {
+        if (layer_id.find(node_proto.input(i)) != layer_id.end())
+        {
+            hasVariableInps = true;
+            break;
+        }
+    }
 
-            layerParams.type = "Resize";
-            if (layerParams.has("scales"))
-            {
-                // Pytorch layer
-                DictValue scales = layerParams.get("scales");
-                CV_Assert(scales.size() == 4);
-                layerParams.set("zoom_factor_y", scales.getIntValue(2));
-                layerParams.set("zoom_factor_x", scales.getIntValue(3));
-            }
-            else if (layerParams.has("height_scale") && layerParams.has("width_scale"))
-            {
-                // Caffe2 layer
-                replaceLayerParam(layerParams, "height_scale", "zoom_factor_y");
-                replaceLayerParam(layerParams, "width_scale", "zoom_factor_x");
-            }
-            else
+    if (!hasVariableInps)
+    {
+        std::vector<Mat> inputs(node_proto.input_size()), concatenated;
+        // Due constant folding we can get inputs with different number of dimensions
+        // Insert the missing dimension to inputs
+        MatShape inputShape;
+        for (size_t i = 0; i < inputs.size(); ++i)
+        {
+            inputs[i] = getBlob(node_proto, i);
+            if (inputs[i].size.dims() > inputShape.size())
             {
-                // scales as input
-                const std::string& input1 = node_proto.input(1);
-                if (constBlobs.find(input1) != constBlobs.end())
-                {
-                    Mat scales = getBlob(input1);
-                    CV_Assert(scales.total() == 4);
-                    layerParams.set("zoom_factor_y", scales.at<float>(2));
-                    layerParams.set("zoom_factor_x", scales.at<float>(3));
-                }
+                inputShape = shape(inputs[i]);
             }
-            replaceLayerParam(layerParams, "mode", "interpolation");
         }
-        else if (layer_type == "SoftMax" || layer_type == "LogSoftmax")
+
+        // Concat-1 has default value for axis is 1: https://github.com/onnx/onnx/blob/master/docs/Changelog.md#Concat-1
+        int axis = layerParams.get<int>("axis", 1);
+        for (size_t i = 0; i < inputs.size(); ++i)
         {
-            layerParams.type = "Softmax";
-            layerParams.set("log_softmax", layer_type == "LogSoftmax");
+            MatShape targetShape = inputShape;
+            targetShape[axis] = shape(inputs[i])[axis];
+            CV_CheckEQ(total(targetShape), total(shape(inputs[i])), "");
+            inputs[i] = inputs[i].reshape(0, targetShape);
         }
-        else if (layer_type == "DetectionOutput")
+        runLayer(layerParams, inputs, concatenated);
+
+        CV_Assert(concatenated.size() == 1);
+        addConstant(layerParams.name, concatenated[0]);
+        return;
+    }
+    else
+    {
+        for (int i = 0; i < node_proto.input_size(); ++i)
         {
-            CV_CheckEQ(node_proto.input_size(), 3, "");
-            if (constBlobs.find(node_proto.input(2)) != constBlobs.end())
+            if (constBlobs.find(node_proto.input(i)) != constBlobs.end())
             {
-                Mat priors = getBlob(node_proto, 2);
-
                 LayerParams constParams;
-                constParams.name = layerParams.name + "/priors";
+                constParams.name = node_proto.input(i);
                 constParams.type = "Const";
-                constParams.blobs.push_back(priors);
+                constParams.blobs.push_back(getBlob(node_proto, i));
 
-                opencv_onnx::NodeProto priorsProto;
-                priorsProto.add_output(constParams.name);
-                addLayer(constParams, priorsProto);
-
-                node_proto.set_input(2, constParams.name);
-            }
-        }
-        else
-        {
-            for (int j = 0; j < node_proto.input_size(); j++) {
-                if (layer_id.find(node_proto.input(j)) == layer_id.end())
-                    layerParams.blobs.push_back(getBlob(node_proto, j));
+                opencv_onnx::NodeProto proto;
+                proto.add_output(constParams.name);
+                addLayer(constParams, proto);
             }
         }
-        addLayer(layerParams, node_proto);
     }
-    catch (const cv::Exception& e)
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseResize(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    for (int i = 1; i < node_proto.input_size(); i++)
+        CV_Assert(layer_id.find(node_proto.input(i)) == layer_id.end());
+
+    if (layerParams.has("coordinate_transformation_mode"))
     {
-        if (DNN_DIAGNOSTICS_RUN)
-        {
-            CV_LOG_ERROR(NULL, "DNN/ONNX: Potential problem during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: "
-                    << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str()) << "\n" << e.msg
-            );
-            auto registeredLayers = getLayerFactoryImpl();
-            if (registeredLayers.find(layerParams.type) != registeredLayers.end())
-            {
-                try
-                {
-                    Ptr<Layer> layer = LayerFactory::createLayerInstance(layerParams.type, layerParams);
-                }
-                catch (const std::exception& e)
-                {
-                    CV_LOG_ERROR(NULL, "DNN/ONNX: Layer of type " << layerParams.type << "(" << layer_type << ") cannot be created with parameters " << layerParams << ". Error: " << e.what()
-                    );
-                }
-            }
-        }
-        else
+        String interp_mode = layerParams.get<String>("coordinate_transformation_mode");
+        CV_Assert_N(interp_mode != "tf_crop_and_resize", interp_mode != "tf_half_pixel_for_nn");
+
+        layerParams.set("align_corners", interp_mode == "align_corners");
+        if (layerParams.get<String>("mode") == "linear")
         {
-            CV_LOG_ERROR(NULL, "DNN/ONNX: ERROR during processing node with " << node_proto.input_size() << " inputs and " << node_proto.output_size() << " outputs: "
-                    << cv::format("[%s]:(%s)", layer_type.c_str(), name.c_str())
-            );
+            layerParams.set("mode", interp_mode == "pytorch_half_pixel" ?
+                                    "opencv_linear" : "bilinear");
         }
-        for (int i = 0; i < node_proto.input_size(); i++)
+    }
+    if (layerParams.get<String>("mode") == "linear" && framework_name == "pytorch")
+        layerParams.set("mode", "opencv_linear");
+
+    // input = [X, scales], [X, roi, scales] or [x, roi, scales, sizes]
+    int foundScaleId = hasDynamicShapes ? node_proto.input_size() - 1
+                                        : node_proto.input_size() > 2 ? 2 : 1;
+
+    Mat scales = getBlob(node_proto, foundScaleId);
+    if (scales.total() == 4)
+    {
+        layerParams.set("zoom_factor_y", scales.at<float>(2));
+        layerParams.set("zoom_factor_x", scales.at<float>(3));
+    }
+    else
+    {
+        const std::string& inputLast = node_proto.input(node_proto.input_size() - 1);
+        if (constBlobs.find(inputLast) != constBlobs.end())
         {
-            CV_LOG_INFO(NULL, "    Input[" << i << "] = '" << node_proto.input(i) << "'");
+            Mat shapes = getBlob(inputLast);
+            CV_CheckEQ(shapes.size[0], 4, "");
+            CV_CheckEQ(shapes.size[1], 1, "");
+            CV_CheckDepth(shapes.depth(), shapes.depth() == CV_32S || shapes.depth() == CV_32F, "");
+            if (shapes.depth() == CV_32F)
+                shapes.convertTo(shapes, CV_32S);
+            layerParams.set("width", shapes.at<int>(3));
+            layerParams.set("height", shapes.at<int>(2));
         }
-        for (int i = 0; i < node_proto.output_size(); i++)
+    }
+    replaceLayerParam(layerParams, "mode", "interpolation");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseUpsample(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    //fused from Resize Subgraph
+    if (layerParams.has("coordinate_transformation_mode"))
+    {
+        String interp_mode = layerParams.get<String>("coordinate_transformation_mode");
+        CV_Assert_N(interp_mode != "tf_crop_and_resize", interp_mode != "tf_half_pixel_for_nn");
+
+        layerParams.set("align_corners", interp_mode == "align_corners");
+        if (layerParams.get<String>("mode") == "linear")
         {
-            CV_LOG_INFO(NULL, "    Output[" << i << "] = '" << node_proto.output(i) << "'");
+            layerParams.set("mode", interp_mode == "pytorch_half_pixel" ?
+                                    "opencv_linear" : "bilinear");
         }
-        if (DNN_DIAGNOSTICS_RUN)
+    }
+    if (layerParams.get<String>("mode") == "linear" && framework_name == "pytorch")
+        layerParams.set("mode", "opencv_linear");
+
+    layerParams.type = "Resize";
+    if (layerParams.has("scales"))
+    {
+        // Pytorch layer
+        DictValue scales = layerParams.get("scales");
+        CV_Assert(scales.size() == 4);
+        layerParams.set("zoom_factor_y", scales.getIntValue(2));
+        layerParams.set("zoom_factor_x", scales.getIntValue(3));
+    }
+    else if (layerParams.has("height_scale") && layerParams.has("width_scale"))
+    {
+        // Caffe2 layer
+        replaceLayerParam(layerParams, "height_scale", "zoom_factor_y");
+        replaceLayerParam(layerParams, "width_scale", "zoom_factor_x");
+    }
+    else
+    {
+        // scales as input
+        const std::string& input1 = node_proto.input(1);
+        if (constBlobs.find(input1) != constBlobs.end())
         {
-            for (int i = 0; i < node_proto.output_size(); ++i)
-            {
-                layer_id.insert(std::make_pair(node_proto.output(i), LayerInfo(0, i)));
-                outShapes[node_proto.output(i)] = outShapes[node_proto.input(0)];
-            }
+            Mat scales = getBlob(input1);
+            CV_Assert(scales.total() == 4);
+            layerParams.set("zoom_factor_y", scales.at<float>(2));
+            layerParams.set("zoom_factor_x", scales.at<float>(3));
         }
-        else
-            CV_Error(Error::StsError, cv::format("Node [%s]:(%s) parse error: %s", layer_type.c_str(), name.c_str(), e.what()));
     }
+    replaceLayerParam(layerParams, "mode", "interpolation");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseSoftMax(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    const std::string& layer_type = node_proto.op_type();
+    layerParams.type = "Softmax";
+    layerParams.set("log_softmax", layer_type == "LogSoftmax");
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseDetectionOutput(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto_)
+{
+    opencv_onnx::NodeProto node_proto = node_proto_;
+    CV_CheckEQ(node_proto.input_size(), 3, "");
+    if (constBlobs.find(node_proto.input(2)) != constBlobs.end())
+    {
+        Mat priors = getBlob(node_proto, 2);
+
+        LayerParams constParams;
+        constParams.name = layerParams.name + "/priors";
+        constParams.type = "Const";
+        constParams.blobs.push_back(priors);
+
+        opencv_onnx::NodeProto priorsProto;
+        priorsProto.add_output(constParams.name);
+        addLayer(constParams, priorsProto);
+
+        node_proto.set_input(2, constParams.name);
+    }
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseCumSum(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    layerParams.type = "CumSum";
+
+    // Get axis.
+    const std::string& input1 = node_proto.input(1);
+
+    if (constBlobs.find(input1) != constBlobs.end())
+    {
+        Mat axis_blob = getBlob(input1);
+        CV_Assert(axis_blob.total() == 1u);
+        layerParams.set("axis", axis_blob.at<int>(0));
+    }
+
+    addLayer(layerParams, node_proto);
+}
+
+void ONNXImporter::parseCustomLayer(LayerParams& layerParams, const opencv_onnx::NodeProto& node_proto)
+{
+    for (int j = 0; j < node_proto.input_size(); j++) {
+        if (layer_id.find(node_proto.input(j)) == layer_id.end())
+            layerParams.blobs.push_back(getBlob(node_proto, j));
+    }
+    addLayer(layerParams, node_proto);
+}
+
+const ONNXImporter::DispatchMap ONNXImporter::buildDispatchMap()
+{
+    DispatchMap dispatch;
+
+    dispatch["MaxPool"] = &ONNXImporter::parseMaxPool;
+    dispatch["AveragePool"] = &ONNXImporter::parseAveragePool;
+    dispatch["GlobalAveragePool"] = dispatch["GlobalMaxPool"] = dispatch["ReduceMean"] = dispatch["ReduceSum"] =
+            dispatch["ReduceMax"] = &ONNXImporter::parseReduce;
+    dispatch["Slice"] = &ONNXImporter::parseSlice;
+    dispatch["Split"] = &ONNXImporter::parseSplit;
+    dispatch["Add"] = dispatch["Sum"] = dispatch["Sub"] = &ONNXImporter::parseBias;
+    dispatch["Pow"] = &ONNXImporter::parsePow;
+    dispatch["Max"] = &ONNXImporter::parseMax;
+    dispatch["Neg"] = &ONNXImporter::parseNeg;
+    dispatch["Constant"] = &ONNXImporter::parseConstant;
+    dispatch["LSTM"] = &ONNXImporter::parseLSTM;
+    dispatch["GRU"] = &ONNXImporter::parseGRU;
+    dispatch["ImageScaler"] = &ONNXImporter::parseImageScaler;
+    dispatch["Clip"] = &ONNXImporter::parseClip;
+    dispatch["LeakyRelu"] = &ONNXImporter::parseLeakyRelu;
+    dispatch["Relu"] = &ONNXImporter::parseRelu;
+    dispatch["Elu"] = &ONNXImporter::parseElu;
+    dispatch["Tanh"] = &ONNXImporter::parseTanh;
+    dispatch["PRelu"] = &ONNXImporter::parsePRelu;
+    dispatch["LRN"] = &ONNXImporter::parseLRN;
+    dispatch["InstanceNormalization"] = &ONNXImporter::parseInstanceNormalization;
+    dispatch["BatchNormalization"] = &ONNXImporter::parseBatchNormalization;
+    dispatch["Gemm"] = &ONNXImporter::parseGemm;
+    dispatch["MatMul"] = &ONNXImporter::parseMatMul;
+    dispatch["Mul"] = dispatch["Div"] = &ONNXImporter::parseMul;
+    dispatch["Conv"] = &ONNXImporter::parseConv;
+    dispatch["ConvTranspose"] = &ONNXImporter::parseConvTranspose;
+    dispatch["Transpose"] = &ONNXImporter::parseTranspose;
+    dispatch["Squeeze"] = &ONNXImporter::parseSqueeze;
+    dispatch["Flatten"] = &ONNXImporter::parseFlatten;
+    dispatch["Unsqueeze"] = &ONNXImporter::parseUnsqueeze;
+    dispatch["Expand"] = &ONNXImporter::parseExpand;
+    dispatch["Reshape"] = &ONNXImporter::parseReshape;
+    dispatch["Pad"] = &ONNXImporter::parsePad;
+    dispatch["Shape"] = &ONNXImporter::parseShape;
+    dispatch["Cast"] = &ONNXImporter::parseCast;
+    dispatch["ConstantFill"] = dispatch["ConstantOfShape"] = &ONNXImporter::parseConstantFill;
+    dispatch["Gather"] = &ONNXImporter::parseGather;
+    dispatch["Concat"] = &ONNXImporter::parseConcat;
+    dispatch["Resize"] = &ONNXImporter::parseResize;
+    dispatch["Upsample"] = &ONNXImporter::parseUpsample;
+    dispatch["SoftMax"] = dispatch["LogSoftmax"] = &ONNXImporter::parseSoftMax;
+    dispatch["DetectionOutput"] = &ONNXImporter::parseDetectionOutput;
+    dispatch["CumSum"] = &ONNXImporter::parseCumSum;
+
+    return dispatch;
 }
 
 Net readNetFromONNX(const String& onnxFile)
 {
-    Net net;
-    ONNXImporter onnxImporter(net, onnxFile.c_str());
-    return net;
+    return detail::readNetDiagnostic<ONNXImporter>(onnxFile.c_str());
 }
 
 Net readNetFromONNX(const char* buffer, size_t sizeBuffer)
 {
-    Net net;
-    ONNXImporter onnxImporter(net, buffer, sizeBuffer);
-    return net;
+    return detail::readNetDiagnostic<ONNXImporter>(buffer, sizeBuffer);
 }
 
 Net readNetFromONNX(const std::vector<uchar>& buffer)
diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp
index f52334bc4597..ab2f161eaf1b 100644
--- a/modules/dnn/src/op_inf_engine.hpp
+++ b/modules/dnn/src/op_inf_engine.hpp
@@ -30,10 +30,11 @@
 #define INF_ENGINE_RELEASE_2021_1 2021010000
 #define INF_ENGINE_RELEASE_2021_2 2021020000
 #define INF_ENGINE_RELEASE_2021_3 2021030000
+#define INF_ENGINE_RELEASE_2021_4 2021040000
 
 #ifndef INF_ENGINE_RELEASE
-#warning("IE version have not been provided via command-line. Using 2021.3 by default")
-#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2021_3
+#warning("IE version have not been provided via command-line. Using 2021.4 by default")
+#define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2021_4
 #endif
 
 #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp
index 65695b85045d..f87988d0a117 100644
--- a/modules/dnn/src/tensorflow/tf_importer.cpp
+++ b/modules/dnn/src/tensorflow/tf_importer.cpp
@@ -32,6 +32,8 @@ namespace cv {
 namespace dnn {
 CV__DNN_INLINE_NS_BEGIN
 
+extern bool DNN_DIAGNOSTICS_RUN;
+
 #if HAVE_PROTOBUF
 
 using ::google::protobuf::RepeatedField;
@@ -404,12 +406,53 @@ void setKSize(LayerParams &layerParams, const tensorflow::NodeDef &layer)
     }
 }
 
-void setPadding(LayerParams &layerParams, const tensorflow::NodeDef &layer)
+void setPadMode(LayerParams &layerParams, const tensorflow::NodeDef &layer)
 {
     if (hasLayerAttr(layer, "padding"))
         layerParams.set("pad_mode", getLayerAttr(layer, "padding").s());
 }
 
+bool getExplicitPadding(LayerParams &layerParams, const tensorflow::NodeDef &layer, int64_t (&pads)[8])
+{
+    if (!layerParams.has("pad_mode") ||
+        layerParams.get("pad_mode").getStringValue() != "EXPLICIT")
+    {
+        return false;
+    }
+
+    CV_Assert(hasLayerAttr(layer, "explicit_paddings"));
+
+    const tensorflow::AttrValue& protoPads = getLayerAttr(layer, "explicit_paddings");
+    if (protoPads.list().i_size() != 8)
+    {
+        CV_Error(Error::StsNotImplemented, "Unsupported asymmetric padding configuration.");
+    }
+
+    int n = sizeof(pads) / sizeof(pads[0]);
+    for (int i = 0; i < n; ++i)
+    {
+        pads[i] = protoPads.list().i(i);
+    }
+
+    if (getDataLayout(layer) != DATA_LAYOUT_NCHW)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF:     Data format " << getLayerAttr(layer, "data_format").s() << ", assuming NHWC.");
+        // Perhaps, we have NHWC padding dimensions order.
+        //  N    H    W    C
+        // 0 1  2 3  4 5  6 7
+        std::swap(pads[2], pads[6]);
+        std::swap(pads[3], pads[7]);
+        //  N    C    W    H
+        // 0 1  2 3  4 5  6 7
+        std::swap(pads[4], pads[6]);
+        std::swap(pads[5], pads[7]);
+        //  N    C    H    W
+        // 0 1  2 3  4 5  6 7
+    }
+
+    return true;
+}
+
 Pin parsePin(const std::string &name)
 {
     Pin pin(name);
@@ -464,6 +507,8 @@ void ExcludeLayer(tensorflow::GraphDef& net, const int layer_index, const int in
         net.mutable_node()->DeleteSubrange(layer_index, 1);
 }
 
+class TFLayerHandler;
+
 class TFImporter
 {
 public:
@@ -471,6 +516,7 @@ class TFImporter
     TFImporter(Net& net, const char *dataModel, size_t lenModel,
                const char *dataConfig = NULL, size_t lenConfig = 0);
 protected:
+    std::unique_ptr<TFLayerHandler> layerHandler;
     Net& dstNet;
     void populateNet();
 
@@ -510,2057 +556,2422 @@ class TFImporter
 
 private:
     void addPermuteLayer(const int* order, const std::string& permName, Pin& inpId);
+    void setPadding(LayerParams &layerParams, const tensorflow::NodeDef &layer, std::string& inputName, float value = 0.);
+
+    friend class TFLayerHandler;
+    typedef void (TFImporter::*TFImporterNodeParser)(tensorflow::GraphDef&, const tensorflow::NodeDef&, LayerParams&);
+    typedef std::map<std::string, TFImporterNodeParser> DispatchMap;
+
+    const DispatchMap dispatch;
+    static const DispatchMap buildDispatchMap();
+
+    void parseConvolution        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseBias               (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseMatMul             (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseReshape            (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseFlatten            (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseTranspose          (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseConstant           (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseLrn                (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseConcat             (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseMaxPool            (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseAvgPool            (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseMaxPoolGrad        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parsePlaceholder        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseSplit              (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseSlice              (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseStridedSlice       (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseMul                (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseFusedBatchNorm     (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseConv2DBackpropInput(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseBlockLSTM          (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseResize             (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseL2Normalize        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parsePriorBox           (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseSoftmax            (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseCropAndResize      (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseMean               (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parsePack               (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseClipByValue        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseLeakyRelu          (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+    void parseActivation         (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
+
+    void parseCustomLayer        (tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams);
 };
 
-TFImporter::TFImporter(Net& net, const char *model, const char *config)
-    : dstNet(net)
+void TFImporter::setPadding(LayerParams &layerParams, const tensorflow::NodeDef &layer, std::string& inputName, float value)
 {
-    if (model && model[0])
-    {
-        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from file: " << model);
-        ReadTFNetParamsFromBinaryFileOrDie(model, &netBin);
-    }
-    if (config && config[0])
+    setPadMode(layerParams, layer);
+    int64_t pads[8];
+
+    if (!getExplicitPadding(layerParams, layer, pads))
     {
-        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from file: " << config);
-        ReadTFNetParamsFromTextFileOrDie(config, &netTxt);
+        return;
     }
 
-    populateNet();
+    LayerParams padLp;
+    padLp.name = layer.name() + "/pad";
+    padLp.type = "Padding";
+    padLp.set("paddings", DictValue::arrayInt(pads, sizeof(pads) / sizeof(pads[0])));
+    padLp.set("value", value);
+
+    int id = dstNet.addLayer(padLp.name, padLp.type, padLp);
+    layer_id[padLp.name] = id;
+
+    connect(layer_id, dstNet, parsePin(inputName), id, 0);
+    inputName = padLp.name;
+
+    layerParams.set("pad_mode", "VALID");
 }
 
-TFImporter::TFImporter(
-        Net& net,
-        const char *dataModel, size_t lenModel,
-        const char *dataConfig, size_t lenConfig
-)
-    : dstNet(net)
+class TFLayerHandler : public detail::LayerHandler
 {
-    if (dataModel != NULL && lenModel > 0)
-    {
-        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from memory (" << lenModel << " bytes)");
-        ReadTFNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBin);
-    }
-    if (dataConfig != NULL && lenConfig > 0)
-    {
-        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from memory (" << lenConfig << " bytes)");
-        ReadTFNetParamsFromTextBufferOrDie(dataConfig, lenConfig, &netTxt);
-    }
-    populateNet();
-}
+public:
+    explicit TFLayerHandler(TFImporter* importer_);
 
-void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
+    void fillRegistry(const tensorflow::GraphDef& net);
+    bool handleMissing(const tensorflow::NodeDef& layer);
+    void handleFailed(const tensorflow::NodeDef& layer);
+
+protected:
+    TFImporter* importer;
+};
+
+const TFImporter::DispatchMap TFImporter::buildDispatchMap()
 {
-    MatShape shape;
-    blobShapeFromTensor(tensor, shape);
-    int dims = (int)shape.size();
+    static DispatchMap dispatch;
+    dispatch["Conv2D"] = dispatch["SpaceToBatchND"] = dispatch["DepthwiseConv2dNative"] =
+            dispatch["Pad"] = dispatch["MirrorPad"] = dispatch["Conv3D"] = &TFImporter::parseConvolution;
+    dispatch["BiasAdd"] = dispatch["Add"] = dispatch["AddV2"] = dispatch["Sub"] = dispatch["AddN"] = &TFImporter::parseBias;
+    dispatch["MatMul"] = &TFImporter::parseMatMul;
+    dispatch["Reshape"] = &TFImporter::parseReshape;
+    dispatch["Flatten"] = dispatch["Squeeze"] = &TFImporter::parseFlatten;
+    dispatch["Transpose"] = &TFImporter::parseTranspose;
+    dispatch["Const"] = &TFImporter::parseConstant;
+    dispatch["LRN"] = &TFImporter::parseLrn;
+    dispatch["Concat"] = dispatch["ConcatV2"] = &TFImporter::parseConcat;
+    dispatch["MaxPool"] = dispatch["MaxPool3D"] = &TFImporter::parseMaxPool;
+    dispatch["AvgPool"] = dispatch["AvgPool3D"] = &TFImporter::parseAvgPool;
+    dispatch["MaxPoolGrad"] = &TFImporter::parseMaxPoolGrad;
+    dispatch["Placeholder"] = &TFImporter::parsePlaceholder;
+    dispatch["Split"] = &TFImporter::parseSplit;
+    dispatch["Slice"] = &TFImporter::parseSlice;
+    dispatch["StridedSlice"] = &TFImporter::parseStridedSlice;
+    dispatch["Mul"] = dispatch["RealDiv"] = &TFImporter::parseMul;
+    dispatch["FusedBatchNorm"] = dispatch["FusedBatchNormV3"] = &TFImporter::parseFusedBatchNorm;
+    dispatch["Conv2DBackpropInput"] = &TFImporter::parseConv2DBackpropInput;
+    dispatch["BlockLSTM"] = &TFImporter::parseBlockLSTM;
+    dispatch["ResizeNearestNeighbor"] = dispatch["ResizeBilinear"] = dispatch["FusedResizeAndPadConv2D"] = &TFImporter::parseResize;
+    dispatch["L2Normalize"] = &TFImporter::parseL2Normalize;
+    dispatch["PriorBox"] = &TFImporter::parsePriorBox;
+    dispatch["Softmax"] = &TFImporter::parseSoftmax;
+    dispatch["CropAndResize"] = &TFImporter::parseCropAndResize;
+    dispatch["Mean"] = dispatch["Sum"] = dispatch["Max"] = &TFImporter::parseMean;
+    dispatch["Pack"] = &TFImporter::parsePack;
+    dispatch["ClipByValue"] = &TFImporter::parseClipByValue;
+    dispatch["LeakyRelu"] = &TFImporter::parseLeakyRelu;
+    dispatch["Abs"] = dispatch["Tanh"] = dispatch["Sigmoid"] = dispatch["Relu"] =
+            dispatch["Elu"] = dispatch["Exp"] = dispatch["Identity"] = dispatch["Relu6"] = &TFImporter::parseActivation;
+
+    return dispatch;
+}
 
-    // TODO: other blob types
-    CV_Assert(tensor.dtype() == tensorflow::DT_FLOAT ||
-              tensor.dtype() == tensorflow::DT_HALF);
-    CV_Assert(dims == 4 || dims == 5);
+// "Conv2D" "SpaceToBatchND" "DepthwiseConv2dNative" "Pad" "MirrorPad" "Conv3D"
+void TFImporter::parseConvolution(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer_, LayerParams& layerParams)
+{
+    tensorflow::NodeDef layer = layer_;
+    std::string name = layer.name();
+    std::string type = layer.op();
+    int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    // The first node of dilated convolution subgraph.
+    // Extract input node, dilation rate and paddings.
+    std::string input = layer.input(0);
+    StrIntVector next_layers;
+    if (type == "SpaceToBatchND" || type == "Pad")
+    {
+        next_layers = getNextLayers(net, name, "Conv2D");
+        if (next_layers.empty())
+            next_layers = getNextLayers(net, name, "DepthwiseConv2dNative");
+    }
 
-    int out_c, input_c, depth, height, width;
-    if (dims == 4)
+    if (type == "SpaceToBatchND")
     {
-        // REORDER kernel HWIO to OIHW
-        swap(shape[0], shape[2]); // IWHO
-        swap(shape[1], shape[3]); // IOHW
-        swap(shape[0], shape[1]); // OIHW
-        depth = 1; height = shape[2]; width = shape[3];
+        // op: "SpaceToBatchND"
+        // input: "input"
+        // input: "SpaceToBatchND/block_shape"
+        // input: "SpaceToBatchND/paddings"
+        CV_CheckEQ(num_inputs, 3, "");
+
+        DictValue dilation = parseDims(getConstBlob(layer, value_id, 1));
+        CV_Assert(dilation.size() == 2);
+        layerParams.set("dilation_h", dilation.get<int>(0));
+        layerParams.set("dilation_w", dilation.get<int>(1));
+
+        Mat paddings;
+        parseTensor<int>(getConstBlob(layer, value_id, 2), paddings);
+
+        // paddings is a 2x2 matrix: [[top, bot], [left, right]]
+        layerParams.set("pad_h", paddings.at<float>(0));
+        layerParams.set("pad_w", paddings.at<float>(2));
+
+        CV_Assert(next_layers.size() == 1);
+        layers_to_ignore.insert(next_layers[0].first);
+
+        // FIXIT don't override, rewrite this code
+        layer = net.node(next_layers[0].second);
+        name = layer.name();
+        type = layer.op();
+        num_inputs = layer.input_size();
+        CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
     }
-    else
+    else if (type == "Pad" || type == "MirrorPad")
     {
-        // REORDER kernel DHWIO to OIDHW
-        swap(shape[0], shape[4]); // OHWID
-        swap(shape[1], shape[3]); // OIWHD
-        swap(shape[2], shape[4]); // OIDHW
-        depth = shape[2]; height = shape[3]; width = shape[4];
+        Mat paddings = getTensorContent(getConstBlob(layer, value_id, 1));
+        CV_Assert(paddings.type() == CV_32SC1);
+        if (paddings.total() == 8)
+        {
+            // Perhaps, we have NHWC padding dimensions order.
+            //  N    H    W    C
+            // 0 1  2 3  4 5  6 7
+            std::swap(paddings.at<int32_t>(2), paddings.at<int32_t>(6));
+            std::swap(paddings.at<int32_t>(3), paddings.at<int32_t>(7));
+            //  N    C    W    H
+            // 0 1  2 3  4 5  6 7
+            std::swap(paddings.at<int32_t>(4), paddings.at<int32_t>(6));
+            std::swap(paddings.at<int32_t>(5), paddings.at<int32_t>(7));
+            //  N    C    H    W
+            // 0 1  2 3  4 5  6 7
+        }
+
+        if (next_layers.empty() || paddings.total() != 8 ||
+            paddings.at<int32_t>(4) != paddings.at<int32_t>(5) ||
+            paddings.at<int32_t>(6) != paddings.at<int32_t>(7) || type == "MirrorPad")
+        {
+            // Just a single padding layer.
+            layerParams.set("paddings", DictValue::arrayInt<int*>((int*)paddings.data, paddings.total()));
+            if (type == "MirrorPad")
+                layerParams.set("type", "reflect");
+
+            int id = dstNet.addLayer(name, "Padding", layerParams);
+            layer_id[name] = id;
+
+            connect(layer_id, dstNet, parsePin(input), id, 0);
+            return;
+        }
+        else
+        {
+            // Merge with subsequent convolutional layer.
+            CV_Assert(next_layers.size() == 1);
+
+            layerParams.set("pad_h", paddings.at<int32_t>(4));
+            layerParams.set("pad_w", paddings.at<int32_t>(6));
+
+            layers_to_ignore.insert(next_layers[0].first);
+
+            // FIXIT don't override, rewrite this code
+            layer = net.node(next_layers[0].second);
+            name = layer.name();
+            type = layer.op();
+            num_inputs = layer.input_size();
+            CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
+        }
     }
-    out_c = shape[0]; input_c = shape[1];
 
-    dstBlob.create(shape, CV_32F);
+    // For the object detection networks, TensorFlow Object Detection API
+    // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
+    // order. We can manage it at DetectionOutput layer parsing predictions
+    // or shuffle last convolution's weights.
+    bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
+                             getLayerAttr(layer, "loc_pred_transposed").b();
 
-    Mat tensorContent = getTensorContent(tensor, /*no copy*/false);
-    int size = tensorContent.total();
-    CV_Assert(size == (int)dstBlob.total());
+    layerParams.set("bias_term", false);
+    layerParams.blobs.resize(1);
 
-    float *dstData = dstBlob.ptr<float>();
-    const float *data = reinterpret_cast<const float*>(tensorContent.data);
+    next_layers = getNextLayers(net, name, "BiasAdd");
+    if (next_layers.size() == 1) {
+        layerParams.set("bias_term", true);
+        layerParams.blobs.resize(2);
 
-    int total = out_c * input_c * depth * height * width;
-    for (int i_oc = 0; i_oc < out_c; i_oc++) {
-        for (int i_ic = 0; i_ic < input_c; i_ic++) {
-            for (int i_d = 0; i_d < depth; i_d++) {
-                for (int i_h = 0; i_h < height; i_h++) {
-                    for (int i_w = 0; i_w < width; i_w++) {
-                        int dst_i = input_c * depth * height * width * i_oc +
-                                    depth * height * width * i_ic + height * width * i_d + width * i_h + i_w;
-                        int src_i = out_c * input_c * width * height * i_d +
-                                    out_c * input_c * width * i_h + out_c * input_c * i_w + out_c * i_ic + i_oc;
-                        CV_Assert(dst_i < total);
-                        CV_Assert(src_i < total);
-                       dstData[dst_i] = data[src_i];
-                   }
-                }
+        int weights_layer_index = next_layers[0].second;
+
+        blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
+        ExcludeLayer(net, weights_layer_index, 0, false);
+        layers_to_ignore.insert(next_layers[0].first);
+
+        // Shuffle bias from yxYX to xyXY.
+        if (locPredTransposed)
+        {
+            const int numWeights = layerParams.blobs[1].total();
+            float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
+            CV_Assert(numWeights % 4 == 0);
+            for (int i = 0; i < numWeights; i += 2)
+            {
+                std::swap(biasData[i], biasData[i + 1]);
             }
         }
     }
-}
 
-void TFImporter::connect(const std::map<String, int>& layers_name_id_map, Net& network, const Pin& outPin,
-             const int input_layer_id, const int input_blob_id)
-{
-    std::map<String, int>::const_iterator it = layers_name_id_map.find(outPin.name);
-    if (it == layers_name_id_map.end())
-        CV_Error(Error::StsError, "Input layer not found: " + outPin.name);
+    int kernelTensorInpId = -1;
+    const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernelTensorInpId);
+    const String kernelTensorName = layer.input(kernelTensorInpId);
+    std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
+    if (sharedWeightsIt == sharedWeights.end())
+    {
+        kernelFromTensor(kernelTensor, layerParams.blobs[0]);
+        releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
+
+        int* kshape = layerParams.blobs[0].size.p;
+        const int outCh = kshape[0];
+        const int inCh = kshape[1];
+        const int height = kshape[2];
+        const int width = kshape[3];
+        if (type == "DepthwiseConv2dNative")
+        {
+            CV_Assert(!locPredTransposed);
+            const int chMultiplier = kshape[0];
+
+            Mat copy = layerParams.blobs[0].clone();
+            float* src = (float*)copy.data;
+            float* dst = (float*)layerParams.blobs[0].data;
+            for (int i = 0; i < chMultiplier; ++i)
+                for (int j = 0; j < inCh; ++j)
+                    for (int s = 0; s < height * width; ++s)
+                    {
+                        int src_i = (i * inCh + j) * height * width + s;
+                        int dst_i = (j * chMultiplier + i) * height* width + s;
+                        dst[dst_i] = src[src_i];
+                    }
+            // TODO Use reshape instead
+            kshape[0] = inCh * chMultiplier;
+            kshape[1] = 1;
+            size_t* kstep = layerParams.blobs[0].step.p;
+            kstep[0] = kstep[1]; // fix steps too
+        }
 
-    std::vector<String>::iterator inpNameIt = std::find(netInputsNames.begin(), netInputsNames.end(), outPin.name);
-    int blobIndex;
-    if (inpNameIt == netInputsNames.end())
-        blobIndex = outPin.blobIndex;
+        // Shuffle output channels from yxYX to xyXY.
+        if (locPredTransposed)
+        {
+            const int slice = height * width * inCh;
+            for (int i = 0; i < outCh; i += 2)
+            {
+                cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
+                cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
+                std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+            }
+        }
+        sharedWeights[kernelTensorName] = layerParams.blobs[0];
+    }
     else
-        blobIndex = inpNameIt - netInputsNames.begin();
-    network.connect(it->second, blobIndex, input_layer_id, input_blob_id);
-}
+    {
+        layerParams.blobs[0] = sharedWeightsIt->second;
+    }
+    Mat weights = layerParams.blobs[0];
+    layerParams.set("kernel_size",  DictValue::arrayInt(&weights.size[2], weights.dims - 2));
 
-void TFImporter::connectToAllBlobs(const std::map<String, int>& layer_id, Net& network, const Pin& outPin,
-                     const int input_layer_id, const int input_blobs_count)
-{
-    for (int input_blob_id = 0; input_blob_id < input_blobs_count; input_blob_id++)
-        connect(layer_id, network, outPin, input_layer_id, input_blob_id);
-}
+    layerParams.set("num_output", layerParams.blobs[0].size[0]);
 
-const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDef &layer, std::map<String, int> const_layers,
-                                              int input_blob_index, int* actual_inp_blob_idx) {
-    if (input_blob_index == -1) {
-        for(int i = 0; i < layer.input_size(); i++) {
-            Pin input = parsePin(layer.input(i));
-            if (const_layers.find(input.name) != const_layers.end()) {
-                if (input_blob_index != -1)
-                    CV_Error(Error::StsError, "More than one input is Const op");
+    setStrides(layerParams, layer);
+    if (!layerParams.has("pad_w") && !layerParams.has("pad_h"))
+        setPadding(layerParams, layer, input);
 
-                input_blob_index = i;
-            }
-        }
+    // The final node of dilated convolution subgraph.
+    next_layers = getNextLayers(net, name, "BatchToSpaceND");
+    if (!next_layers.empty())
+    {
+        CV_Assert(next_layers.size() == 1);
+        ExcludeLayer(net, next_layers[0].second, 0, false);
+        layers_to_ignore.insert(next_layers[0].first);
     }
 
-    if (input_blob_index == -1)
-        CV_Error(Error::StsError, "Const input blob for weights not found");
+    int id = dstNet.addLayer(name, "Convolution", layerParams);
+    layer_id[name] = id;
 
-    Pin kernel_inp = parsePin(layer.input(input_blob_index));
-    if (const_layers.find(kernel_inp.name) == const_layers.end())
-        CV_Error(Error::StsError, "Input [" + layer.input(input_blob_index) +
-                                  "] for node [" + layer.name() + "] not found");
-    if (kernel_inp.blobIndex != 0)
-        CV_Error(Error::StsError, "Unsupported kernel input");
+    // one input only
+    connect(layer_id, dstNet, parsePin(input), id, 0);
 
-    if(actual_inp_blob_idx) {
-        *actual_inp_blob_idx = input_blob_index;
+
+    if (getDataLayout(name, data_layouts) == DATA_LAYOUT_UNKNOWN)
+        data_layouts[name] = DATA_LAYOUT_NHWC;
+}
+
+// "BiasAdd" "Add" "AddV2" "Sub" "AddN"
+void TFImporter::parseBias(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    bool haveConst = false;
+    for(int ii = 0; !haveConst && ii < num_inputs; ++ii)
+    {
+        Pin input = parsePin(layer.input(ii));
+        haveConst = value_id.find(input.name) != value_id.end();
     }
+    CV_Assert(!haveConst || num_inputs == 2);
 
-    int nodeIdx = const_layers.at(kernel_inp.name);
-    if (nodeIdx < netBin.node_size() && netBin.node(nodeIdx).name() == kernel_inp.name)
+    if (haveConst)
     {
-        return netBin.node(nodeIdx).attr().at("value").tensor();
+        Mat values = getTensorContent(getConstBlob(layer, value_id));
+        CV_Assert(values.type() == CV_32FC1);
+        if (type == "Sub")
+            values *= -1.0f;
+
+        int id;
+        if (values.total() == 1)  // is a scalar.
+        {
+            layerParams.set("shift", values.at<float>(0));
+            id = dstNet.addLayer(name, "Power", layerParams);
+        }
+        else  // is a vector
+        {
+            layerParams.blobs.resize(1, values);
+            id = dstNet.addLayer(name, "Shift", layerParams);
+        }
+        layer_id[name] = id;
+
+        // one input only
+        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
     }
     else
     {
-        CV_Assert_N(nodeIdx < netTxt.node_size(),
-                    netTxt.node(nodeIdx).name() == kernel_inp.name);
-        return netTxt.node(nodeIdx).attr().at("value").tensor();
+        layerParams.set("operation", "sum");
+        if (type == "Sub")
+        {
+            static float subCoeffs[] = {1.f, -1.f};
+            layerParams.set("coeff", DictValue::arrayReal<float*>(subCoeffs, 2));
+        }
+
+        int id = dstNet.addLayer(name, "Eltwise", layerParams);
+        layer_id[name] = id;
+
+        for (int ii = 0; ii < num_inputs; ii++)
+        {
+            Pin inp = parsePin(layer.input(ii));
+            if (layer_id.find(inp.name) == layer_id.end())
+                CV_Error(Error::StsError, "Input layer not found: " + inp.name);
+            connect(layer_id, dstNet, inp, id, ii);
+        }
     }
 }
 
-static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& const_layers,
-                          std::set<String>& layers_to_ignore)
+void TFImporter::parseMatMul(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
-    CV_LOG_DEBUG(NULL, "DNN/TF: addConstNodes(): handling " << net.node_size() << " nodes...");
-    for (int li = 0; li < net.node_size(); li++)
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 2, "");
+
+    // For the object detection networks, TensorFlow Object Detection API
+    // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
+    // order. We can manage it at DetectionOutput layer parsing predictions
+    // or shuffle last Faster-RCNN's matmul weights.
+    bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
+                             getLayerAttr(layer, "loc_pred_transposed").b();
+
+    layerParams.set("bias_term", false);
+    layerParams.blobs.resize(1);
+
+    StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");  // FIXIT Use layers fusion instead
+    if (next_layers.empty())
     {
-        const tensorflow::NodeDef &layer = net.node(li);
-        String name = layer.name();
-        String type = layer.op();
+        next_layers = getNextLayers(net, name, "Add");
+    }
+    if (next_layers.size() == 1) {
+        layerParams.set("bias_term", true);
+        layerParams.blobs.resize(2);
 
-        //CV_LOG_DEBUG(NULL, "DNN/TF: layer_id=" << li << " - '" << name << "' @ " << type);
+        int weights_layer_index = next_layers[0].second;
+        blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
+        ExcludeLayer(net, weights_layer_index, 0, false);
+        layers_to_ignore.insert(next_layers[0].first);
 
-        try
+        if (locPredTransposed)
         {
-            if (type == "Dequantize")
+            const int numWeights = layerParams.blobs[1].total();
+            float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
+            CV_Assert(numWeights % 4 == 0);
+            for (int i = 0; i < numWeights; i += 2)
             {
-                // Example of Dequantize node:
-                //   name: "conv2d_1/bias"
-                //   op: "Dequantize"
-                //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
-                //   input: "conv2d_1/bias_quantized_min"
-                //   input: "conv2d_1/bias_quantized_max"
-                //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
-                //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
-                CV_CheckEQ(layer.input_size(), 3, "Dequantize: 3 inputs is supported only");
-                for (int i = 0; i < 3; ++i)
-                    CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
-                CV_Assert(hasLayerAttr(layer, "mode") &&
-                          getLayerAttr(layer, "mode").s() == "MIN_FIRST");
+                std::swap(biasData[i], biasData[i + 1]);
+            }
+        }
+    }
 
-                int tensorId = const_layers[layer.input(0)];
-                int minId = const_layers[layer.input(1)];
-                int maxId = const_layers[layer.input(2)];
+    int kernel_blob_index = -1;
+    const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernel_blob_index);
+    const String kernelTensorName = layer.input(kernel_blob_index);
+    std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
+    if (sharedWeightsIt == sharedWeights.end())
+    {
+        blobFromTensor(kernelTensor, layerParams.blobs[0]);
+        releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
+        sharedWeights[kernelTensorName] = layerParams.blobs[0];
+    }
+    else
+    {
+        layerParams.blobs[0] = sharedWeightsIt->second;
+    }
 
-                tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
-                                                    ->mutable_attr()->at("value")
-                                                     .mutable_tensor();
-                CV_CheckEQ((int)tensor->dtype(), (int)tensorflow::DT_QUINT8, "");
+    if (kernel_blob_index == 1) { // In this case output is computed by x*W formula - W should be transposed
+        Mat data = layerParams.blobs[0].t();
+        layerParams.blobs[0] = data.clone();
+    }
 
-                Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
-                Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
-                CV_CheckEQ(qMin.total(), (size_t)1, "");
-                CV_CheckTypeEQ(qMin.type(), CV_32FC1, "");
-                CV_CheckEQ(qMax.total(), (size_t)1, "");
-                CV_CheckTypeEQ(qMax.type(), CV_32FC1, "");
+    layerParams.set("num_output", layerParams.blobs[0].size[0]);
+    if (locPredTransposed)
+    {
+        CV_Assert(layerParams.blobs[0].dims == 2);
+        for (int i = 0; i < layerParams.blobs[0].size[0]; i += 2)
+        {
+            cv::Mat src = layerParams.blobs[0].row(i);
+            cv::Mat dst = layerParams.blobs[0].row(i + 1);
+            std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
+        }
+    }
 
-                Mat content = getTensorContent(*tensor);
-
-                float minVal = qMin.at<float>(0);
-                float rangeScale = (qMax.at<float>(0) - minVal) / 255;
-                CV_Assert(rangeScale >= 0);
-                content.convertTo(content, CV_32FC1, rangeScale,
-                                  rangeScale * cvRound(minVal / rangeScale));
-
-                tensor->set_dtype(tensorflow::DT_FLOAT);
-                tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
-
-                net.mutable_node(tensorId)->set_name(name);
-                CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
-                layers_to_ignore.insert(name);
-                continue;
-            }
-            else if (type != "Const")
-                continue;  // only Const parameters are supported
+    int id = dstNet.addLayer(name, "InnerProduct", layerParams);
+    layer_id[name] = id;
 
-            if (layer.attr().find("value") != layer.attr().end())
-            {
-                CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
-            }
-            layers_to_ignore.insert(name);
-        }
-        catch (const std::exception& e)
-        {
-            CV_LOG_ERROR(NULL, "DNN/TF: Can't handle node='" << name << "'. Exception: " << e.what());
-            throw;
-        }
-    }
-    CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size());
+    // one input only
+    int input_blob_index = kernel_blob_index == 0 ? 1 : 0;
+    connect(layer_id, dstNet, parsePin(layer.input(input_blob_index)), id, 0);
+    data_layouts[name] = DATA_LAYOUT_PLANAR;
 }
 
-// If all inputs of specific layer have the same data layout we can say that
-// this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise.
-DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer)
+void TFImporter::parseReshape(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
-    DataLayout layout = getDataLayout(layer);
-    if (layout != DATA_LAYOUT_UNKNOWN)
-    {
-        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from attrs)");
-        return layout;
-    }
-
-    // Determine layout by layer's inputs
-    for (int i = 0, n = layer.input_size(); i < n; ++i)
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    Pin inpId = parsePin(layer.input(0));
+    DataLayout inpLayout = getDataLayout(layer.input(0), data_layouts);
+    // There are two possible implementations: reshape an input using
+    // predefined sizes or use a second input blob as a source of new shape.
+    if (value_id.find(layer.input(1)) != value_id.end())
     {
-        std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layer.input(i)));
-        if (it != data_layouts.end())
+        Mat newShape = getTensorContent(getConstBlob(layer, value_id, 1));
+        int newShapeSize = newShape.total();
+        bool hasSwap = false;
+        if (newShapeSize == 4 && hasAllOnes(newShape, 0, 2))
         {
-            if (layout != DATA_LAYOUT_UNKNOWN)
+            // NHWC->NCHW
+            std::swap(*newShape.ptr<int32_t>(0, 2), *newShape.ptr<int32_t>(0, 3));
+            std::swap(*newShape.ptr<int32_t>(0, 1), *newShape.ptr<int32_t>(0, 2));
+            hasSwap = true;
+        }
+        if (inpLayout == DATA_LAYOUT_NHWC)
+        {
+            if (newShapeSize >= 2 || newShape.at<int>(1) == 1)
             {
-                if (it->second != layout && it->second != DATA_LAYOUT_UNKNOWN)
-                    return DATA_LAYOUT_UNKNOWN;
+                int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
+                addPermuteLayer(order, name + "/nhwc", inpId);
+                if (newShapeSize < 4)
+                {
+                    inpLayout = DATA_LAYOUT_NCHW;
+                }
+                else
+                {
+                    inpLayout = DATA_LAYOUT_NHWC;
+                }
             }
-            else
-                layout = it->second;
         }
-    }
+        layerParams.set("dim", DictValue::arrayInt<int*>(newShape.ptr<int>(), newShapeSize));
 
-    if (layout != DATA_LAYOUT_UNKNOWN)
+        int id = dstNet.addLayer(name, "Reshape", layerParams);
+        layer_id[name] = id;
+
+        // one input only
+        connect(layer_id, dstNet, inpId, id, 0);
+        inpId = Pin(name);
+
+        if ((inpLayout == DATA_LAYOUT_NHWC || inpLayout == DATA_LAYOUT_UNKNOWN || inpLayout == DATA_LAYOUT_PLANAR) &&
+            newShapeSize == 4 && !hasSwap)
+        {
+            int order[] = {0, 3, 1, 2};  // Transform back to OpenCV's NCHW.
+            addPermuteLayer(order, name + "/nchw", inpId);
+            inpLayout = DATA_LAYOUT_NCHW;
+        }
+
+        data_layouts[name] = newShapeSize == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
+    }
+    else
     {
-        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from inputs)");
-        return layout;
+        int id = dstNet.addLayer(name, "Reshape", layerParams);
+        layer_id[name] = id;
+        connect(layer_id, dstNet, inpId, id, 0);
+        connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
+        data_layouts[name] = inpLayout;
     }
-
-    // Determine layout by layer's consumers recursively.
-    std::map<String, DataLayout>::const_iterator it = data_layouts.find(layer.name());
-    CV_Assert(it != data_layouts.end());
-    return it->second;
 }
 
-void TFImporter::populateNet()
+// "Flatten" "Squeeze"
+void TFImporter::parseFlatten(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
-    CV_Assert(netBin.ByteSize() || netTxt.ByteSize());
-
-    CV_LOG_INFO(NULL, "DNN/TF: parsing model"
-        << (netBin.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netBin.versions().producer(), (int)netBin.versions().min_consumer()) : cv::String(" (N/A version info)"))
-        << ". Number of nodes = " << netBin.node_size()
-    );
-
-    if (netTxt.ByteSize())
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    Pin inpId = parsePin(layer.input(0));
+    int inpLayout = getDataLayout(layer.input(0), data_layouts);
+    if (type == "Squeeze")
     {
-        CV_LOG_INFO(NULL, "DNN/TF: parsing config"
-            << (netTxt.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netTxt.versions().producer(), (int)netTxt.versions().min_consumer()) : cv::String(" (N/A version info)"))
-            << ". Number of nodes = " << netTxt.node_size()
-        );
-
-        RemoveIdentityOps(netBin);
-        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
-        RemoveIdentityOps(netTxt);
-        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(config) => " << netTxt.node_size() << " nodes");
-
-        sortByExecutionOrder(netTxt);
-        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(config) => " << netTxt.node_size() << " nodes");
+        CV_Assert(hasLayerAttr(layer, "squeeze_dims"));
+        const tensorflow::AttrValue& dims = getLayerAttr(layer, "squeeze_dims");
+        std::vector<int> dimsVector(dims.list().i_size());
+        for (int i = 0; i < dimsVector.size(); ++i)
+            dimsVector[i] = dims.list().i(i);
+
+        // Flatten layer can squeeze dimensions range into one.
+        std::sort(dimsVector.begin(), dimsVector.end());
+        for (int i = 1; i < dimsVector.size(); ++i)
+        {
+            if (dimsVector[i] != dimsVector[i - 1] + 1)
+                CV_Error(Error::StsNotImplemented, "Unsupported squeeze configuration");
+        }
+        int start = dimsVector.front() - 1, end = dimsVector.back();
+        if (start == -1 && end == 0)  // squeeze 0th dimension
+        {
+            start = 0;
+            end = 1;
+        }
+        layerParams.set("axis", start);
+        layerParams.set("end_axis", end);
     }
-    else
+    if (inpLayout == DATA_LAYOUT_NHWC)
     {
-        removePhaseSwitches(netBin);
-        CV_LOG_DEBUG(NULL, "DNN/TF: removePhaseSwitches(model) => " << netBin.node_size() << " nodes");
-
-        RemoveIdentityOps(netBin);
-        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
-
-        simplifySubgraphs(netBin);
-        CV_LOG_DEBUG(NULL, "DNN/TF: simplifySubgraphs(model) => " << netBin.node_size() << " nodes");
-        sortByExecutionOrder(netBin);
-        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(model) => " << netBin.node_size() << " nodes");
+        LayerParams permLP;
+        int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
+        permLP.set("order", DictValue::arrayInt<int*>(order, 4));
+
+        std::string permName = name + "/nchw";
+        CV_Assert(layer_id.find(permName) == layer_id.end());
+        int permId = dstNet.addLayer(permName, "Permute", permLP);
+        layer_id[permName] = permId;
+        connect(layer_id, dstNet, inpId, permId, 0);
+        inpId = Pin(permName);
     }
+    int id = dstNet.addLayer(name, "Flatten", layerParams);
+    layer_id[name] = id;
+    connect(layer_id, dstNet, inpId, id, 0);
+    data_layouts[name] = DATA_LAYOUT_PLANAR;
+}
 
-    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
-
-    int layersSize = net.node_size();
-
-    // Pre-fill data layouts where they are set explicitly.
-    // Assuming that nodes are in topological order
-    for (int i = layersSize - 1; i >= 0; --i)
+void TFImporter::parseTranspose(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    Mat perm = getTensorContent(getConstBlob(layer, value_id, 1));
+    CV_Assert(perm.type() == CV_32SC1);
+    int* permData = (int*)perm.data;
+    if (perm.total() == 4)
     {
-        const tensorflow::NodeDef& layer = net.node(i);
-        std::string name = layer.name();
-
-        CV_LOG_DEBUG(NULL, "DNN/TF: node(" << i << " - '" << name << "') propagating layout...");
-
-        try
+        // Only NHWC <-> NCHW permutations are allowed. OpenCV is always
+        // keep NCHW layout this way.
+        int inpLayout = getDataLayout(layer.input(0), data_layouts);
+        std::string type = "Identity";
+        if (inpLayout == DATA_LAYOUT_NHWC)
         {
-            DataLayout layout = getDataLayout(layer);
-            std::map<String, DataLayout>::iterator it = data_layouts.find(name);
-            if (it != data_layouts.end())
+            if (permData[0] == 0 && permData[1] == 3 && permData[2] == 1 && permData[3] == 2)
             {
-                if (layout != DATA_LAYOUT_UNKNOWN)
-                {
-                    if (it->second == DATA_LAYOUT_UNKNOWN)
-                        it->second = layout;
-                    else if (it->second != layout)
-                    {
-                        it->second = DATA_LAYOUT_UNKNOWN;
-                        layout = DATA_LAYOUT_UNKNOWN;
-                    }
-                }
-                else
-                    layout = it->second;
+                // in TensorFlow: NHWC->NCHW
+                // in OpenCV: NCHW->NCHW
+                data_layouts[name] = DATA_LAYOUT_NCHW;
             }
-            else
-                data_layouts[name] = layout;
-
-            // Specify input layers to have the same data layout.
-            for (int j = 0; j < layer.input_size(); ++j)
+            else if (permData[0] == 0 && permData[1] == 1 && permData[2] == 2 && permData[3] == 3)
             {
-                name = getNodeName(layer.input(j));
-                it = data_layouts.find(name);
-                if (it != data_layouts.end())
-                {
-                    if (layout != DATA_LAYOUT_UNKNOWN)
-                    {
-                        if (it->second == DATA_LAYOUT_UNKNOWN)
-                            it->second = layout;
-                        else if (it->second != layout)
-                            it->second = DATA_LAYOUT_UNKNOWN;
-                    }
-                }
-                else
-                    data_layouts[name] = layout;
+                // in TensorFlow: NHWC->NHWC
+                // in OpenCV: NCHW->NCHW
+                data_layouts[name] = DATA_LAYOUT_NHWC;
+            }
+            else if (permData[0] == 0 && permData[1] == 3 && permData[2] == 2 && permData[3] == 1)
+            {
+                // in TensorFlow: NHWC->NCWH
+                // in OpenCV: NCHW->NCWH
+                int permData[] = {0, 1, 3, 2};
+                layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
+                data_layouts[name] = DATA_LAYOUT_NCHW;  // we keep track NCHW because channels position only matters
+                type = "Permute";
             }
+            else
+                CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
         }
-        catch (const std::exception& e)
+        else if (inpLayout == DATA_LAYOUT_NCHW)
         {
-            CV_LOG_ERROR(NULL, "DNN/TF: Can't propagate layout for node='" << name << "'. Exception: " << e.what());
-            throw;
+            if (permData[0] == 0 && permData[1] == 2 && permData[2] == 3 && permData[3] == 1)
+            {
+                // in TensorFlow: NCHW->NHWC
+                // in OpenCV: NCHW->NCHW
+                data_layouts[name] = DATA_LAYOUT_NHWC;
+            }
+            else if (permData[0] == 0 && permData[1] == 1 && permData[2] == 2 && permData[3] == 3)
+            {
+                // in TensorFlow: NCHW->NCHW
+                // in OpenCV: NCHW->NCHW
+                data_layouts[name] = DATA_LAYOUT_NCHW;
+            }
+            else
+                CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
         }
+        int id = dstNet.addLayer(name, type, layerParams);
+        layer_id[name] = id;
+        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
     }
-
-    addConstNodes(netBin, value_id, layers_to_ignore);
-    addConstNodes(netTxt, value_id, layers_to_ignore);
-
-
-    for (int li = 0; li < layersSize; li++)
+    else
     {
-        const tensorflow::NodeDef& layer = net.node(li);
+        layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
 
-        const std::string name = layer.name();
-        const std::string type = layer.op();
-        const int ninputs = layer.input_size();
-        CV_LOG_DEBUG(NULL, "DNN/TF: (" << li << "/" << layersSize << ") Parse layer " << name << " @ " << type << " with " << ninputs << " inputs");
+        int id = dstNet.addLayer(name, "Permute", layerParams);
+        layer_id[name] = id;
 
-        parseNode(layer);
+        // one input only
+        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        data_layouts[name] = DATA_LAYOUT_UNKNOWN;
     }
+}
 
-    for (size_t i = 0; i < netInputsNames.size(); i++)
-    {
-        CV_LOG_DEBUG(NULL, "DNN/TF: Model input: " << i << " - '" << netInputsNames[i] << "'");
-        CV_Assert(!netInputsNames[i].empty());
-    }
-    dstNet.setInputsNames(netInputsNames);
-    CV_LOG_DEBUG(NULL, "DNN/TF: ===================== Import completed =====================");
+void TFImporter::parseConstant(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
 }
 
-void TFImporter::addPermuteLayer(const int* order, const std::string& permName, Pin& inpId)
+void TFImporter::parseLrn(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
-    LayerParams permLP;
-    permLP.set("order", DictValue::arrayInt<const int*>(order, 4));
-    CV_Assert(layer_id.find(permName) == layer_id.end());
-    int permId = dstNet.addLayer(permName, "Permute", permLP);
-    layer_id[permName] = permId;
-    connect(layer_id, dstNet, inpId, permId, 0);
-    inpId = Pin(permName);
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    if(hasLayerAttr(layer, "alpha")) {
+        layerParams.set("alpha", getLayerAttr(layer, "alpha").f());
+    }
+    if(hasLayerAttr(layer, "beta")) {
+        layerParams.set("beta", getLayerAttr(layer, "beta").f());
+    }
+    if(hasLayerAttr(layer, "depth_radius")) {
+        int radius = (int)getLayerAttr(layer, "depth_radius").i();
+        layerParams.set("local_size", 2*radius + 1);
+    }
+    if(hasLayerAttr(layer, "bias")) {
+        layerParams.set("bias", getLayerAttr(layer, "bias").f());
+    }
+    layerParams.set("norm_by_size", false);
+
+    int id = dstNet.addLayer(name, "LRN", layerParams);
+    layer_id[name] = id;
+
+    connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
 }
 
-void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
+// "Concat" "ConcatV2"
+void TFImporter::parseConcat(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
 {
-    tensorflow::NodeDef layer = layer_;
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
 
-    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
+    CV_CheckGT(num_inputs, 0, "");
+    int axisId = (type == "Concat" ? 0 : num_inputs - 1);
+    int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0);
 
-    /*const*/ std::string name = layer.name();
-    /*const*/ std::string type = layer.op();
-    /*const*/ int num_inputs = layer.input_size();
+    if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+        axis = toNCHW(axis);
+    else if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NDHWC)
+        axis = toNCDHW(axis);
+    layerParams.set("axis", axis);
 
-    try
-    {
-        LayerParams layerParams;
+    // input(0) or input(n-1) is concat_dim
+    int from = (type == "Concat" ? 1 : 0);
+    int to = (type == "Concat" ? num_inputs : num_inputs - 1);
 
-        if (layers_to_ignore.find(name) != layers_to_ignore.end())
+    for (int ii = from; ii < to; ii++)
+    {
+        Pin inp = parsePin(layer.input(ii));
+        if (layer_id.find(inp.name) == layer_id.end())
         {
-            CV_LOG_DEBUG(NULL, "DNN/TF:     ignored");
-            return;
+            // There are constant inputs.
+            LayerParams lp;
+            lp.name = inp.name;
+            lp.type = "Const";
+            lp.blobs.resize(1);
+            blobFromTensor(getConstBlob(layer, value_id, ii), lp.blobs.back());
+            CV_Assert_N(!lp.blobs[0].empty(), lp.blobs[0].type() == CV_32F);
+
+            int constInpId = dstNet.addLayer(lp.name, lp.type, lp);
+            layer_id[lp.name] = constInpId;
         }
+    }
 
-        DataLayout predictedLayout = predictOutputDataLayout(layer);
-        data_layouts[name] = predictedLayout;
-
-        if (type == "Conv2D" || type == "SpaceToBatchND" || type == "DepthwiseConv2dNative" || type == "Pad" || type == "MirrorPad" || type == "Conv3D")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            // The first node of dilated convolution subgraph.
-            // Extract input node, dilation rate and paddings.
-            std::string input = layer.input(0);
-            StrIntVector next_layers;
-            if (type == "SpaceToBatchND" || type == "Pad")
-            {
-                next_layers = getNextLayers(net, name, "Conv2D");
-                if (next_layers.empty())
-                    next_layers = getNextLayers(net, name, "DepthwiseConv2dNative");
-            }
+    int id = dstNet.addLayer(name, "Concat", layerParams);
+    layer_id[name] = id;
 
-            if (type == "SpaceToBatchND")
-            {
-                // op: "SpaceToBatchND"
-                // input: "input"
-                // input: "SpaceToBatchND/block_shape"
-                // input: "SpaceToBatchND/paddings"
-                CV_CheckEQ(num_inputs, 3, "");
+    for (int ii = from; ii < to; ii++)
+    {
+        Pin inp = parsePin(layer.input(ii));
+        if (layer_id.find(inp.name) == layer_id.end())
+            CV_Error(Error::StsError, "Input layer not found: " + inp.name);
+        connect(layer_id, dstNet, inp, id, ii - from);
+    }
+}
 
-                DictValue dilation = parseDims(getConstBlob(layer, value_id, 1));
-                CV_Assert(dilation.size() == 2);
-                layerParams.set("dilation_h", dilation.get<int>(0));
-                layerParams.set("dilation_w", dilation.get<int>(1));
+// "MaxPool" "MaxPool3D"
+void TFImporter::parseMaxPool(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+    std::string inputName = layer.input(0);
 
-                Mat paddings;
-                parseTensor<int>(getConstBlob(layer, value_id, 2), paddings);
+    CV_CheckGT(num_inputs, 0, "");
+    layerParams.set("pool", "max");
 
-                // paddings is a 2x2 matrix: [[top, bot], [left, right]]
-                layerParams.set("pad_h", paddings.at<float>(0));
-                layerParams.set("pad_w", paddings.at<float>(2));
+    setKSize(layerParams, layer);
+    setStrides(layerParams, layer);
+    setPadding(layerParams, layer, inputName, -std::numeric_limits<float>::infinity());
+    // Test_TensorFlow_nets.EAST_text_detection/1, NGRAPH/CPU
+    layerParams.set("ceil_mode", false);
 
-                CV_Assert(next_layers.size() == 1);
-                layers_to_ignore.insert(next_layers[0].first);
+    int id = dstNet.addLayer(name, "Pooling", layerParams);
+    layer_id[name] = id;
 
-                // FIXIT don't override, rewrite this code
-                layer = net.node(next_layers[0].second);
-                name = layer.name();
-                type = layer.op();
-                num_inputs = layer.input_size();
-                CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
-            }
-            else if (type == "Pad" || type == "MirrorPad")
-            {
-                Mat paddings = getTensorContent(getConstBlob(layer, value_id, 1));
-                CV_Assert(paddings.type() == CV_32SC1);
-                if (paddings.total() == 8)
-                {
-                    // Perhaps, we have NHWC padding dimensions order.
-                    //  N    H    W    C
-                    // 0 1  2 3  4 5  6 7
-                    std::swap(paddings.at<int32_t>(2), paddings.at<int32_t>(6));
-                    std::swap(paddings.at<int32_t>(3), paddings.at<int32_t>(7));
-                    //  N    C    W    H
-                    // 0 1  2 3  4 5  6 7
-                    std::swap(paddings.at<int32_t>(4), paddings.at<int32_t>(6));
-                    std::swap(paddings.at<int32_t>(5), paddings.at<int32_t>(7));
-                    //  N    C    H    W
-                    // 0 1  2 3  4 5  6 7
-                }
+    connectToAllBlobs(layer_id, dstNet, parsePin(inputName), id, num_inputs);
+}
 
-                if (next_layers.empty() || paddings.total() != 8 ||
-                    paddings.at<int32_t>(4) != paddings.at<int32_t>(5) ||
-                    paddings.at<int32_t>(6) != paddings.at<int32_t>(7) || type == "MirrorPad")
-                {
-                    // Just a single padding layer.
-                    layerParams.set("paddings", DictValue::arrayInt<int*>((int*)paddings.data, paddings.total()));
-                    if (type == "MirrorPad")
-                        layerParams.set("type", "reflect");
+// "AvgPool" "AvgPool3D"
+void TFImporter::parseAvgPool(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-                    int id = dstNet.addLayer(name, "Padding", layerParams);
-                    layer_id[name] = id;
+    CV_CheckGT(num_inputs, 0, "");
+    layerParams.set("pool", "ave");
+    layerParams.set("ave_pool_padded_area", false);
+    setKSize(layerParams, layer);
+    setStrides(layerParams, layer);
+    setPadMode(layerParams, layer);
 
-                    connect(layer_id, dstNet, parsePin(input), id, 0);
-                    return;
-                }
-                else
-                {
-                    // Merge with subsequent convolutional layer.
-                    CV_Assert(next_layers.size() == 1);
+    int id = dstNet.addLayer(name, "Pooling", layerParams);
+    layer_id[name] = id;
 
-                    layerParams.set("pad_h", paddings.at<int32_t>(4));
-                    layerParams.set("pad_w", paddings.at<int32_t>(6));
+    connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
+}
 
-                    layers_to_ignore.insert(next_layers[0].first);
+void TFImporter::parseMaxPoolGrad(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-                    // FIXIT don't override, rewrite this code
-                    layer = net.node(next_layers[0].second);
-                    name = layer.name();
-                    type = layer.op();
-                    num_inputs = layer.input_size();
-                    CV_LOG_DEBUG(NULL, "DNN/TF:     switched to layer " << name << " @ " << type << ") with " << num_inputs << " inputs");
-                }
-            }
+    CV_CheckEQ(num_inputs, 3, "");
 
-            // For the object detection networks, TensorFlow Object Detection API
-            // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
-            // order. We can manage it at DetectionOutput layer parsing predictions
-            // or shuffle last convolution's weights.
-            bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
-                                     getLayerAttr(layer, "loc_pred_transposed").b();
+    layerParams.set("pool_k_h", 0);
+    layerParams.set("pool_k_w", 0);
+    layerParams.set("pool_stride_h", 0);
+    layerParams.set("pool_stride_w", 0);
+    layerParams.set("pool_pad_h", 0);
+    layerParams.set("pool_pad_w", 0);
 
-            layerParams.set("bias_term", false);
-            layerParams.blobs.resize(1);
+    int id = dstNet.addLayer(name, "MaxUnpool", layerParams);
+    layer_id[name] = id;
 
-            next_layers = getNextLayers(net, name, "BiasAdd");
-            if (next_layers.size() == 1) {
-                layerParams.set("bias_term", true);
-                layerParams.blobs.resize(2);
+    connect(layer_id, dstNet, parsePin(layer.input(2)), id, 0);
+    connect(layer_id, dstNet, parsePin(layer.input(1) + ":1"), id, 1);
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 2);
+}
 
-                int weights_layer_index = next_layers[0].second;
+void TFImporter::parsePlaceholder(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
 
-                blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
-                ExcludeLayer(net, weights_layer_index, 0, false);
-                layers_to_ignore.insert(next_layers[0].first);
+    DataLayout predictedLayout = data_layouts[name];
 
-                // Shuffle bias from yxYX to xyXY.
-                if (locPredTransposed)
-                {
-                    const int numWeights = layerParams.blobs[1].total();
-                    float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
-                    CV_Assert(numWeights % 4 == 0);
-                    for (int i = 0; i < numWeights; i += 2)
-                    {
-                        std::swap(biasData[i], biasData[i + 1]);
-                    }
-                }
-            }
+    if (!hasLayerAttr(layer, "dtype") ||
+        getLayerAttr(layer, "dtype").type() != tensorflow::DT_BOOL)  // If input is not a train/test flag.
+    {
+        netInputsNames.push_back(name);
+        layer_id[name] = 0;
+    }
+    tensorflow::TensorShapeProto shape;
+    if (hasLayerAttr(layer, "shape"))
+        shape = getLayerAttr(layer, "shape").shape();
+    else if (hasLayerAttr(layer, "_output_shapes"))
+    {
+        tensorflow::AttrValue_ListValue list = getLayerAttr(layer, "_output_shapes").list();
+        if (list.shape_size())
+            shape = list.shape()[0];
+    }
+    if (shape.dim_size())
+    {
+        MatShape dims(shape.dim_size());
+        for (int i = 0; i < dims.size(); ++i)
+            dims[i] = shape.dim(i).size();
+        if (dims.size() == 4 && predictedLayout == DATA_LAYOUT_NHWC)
+        {
+            std::swap(dims[1], dims[3]);  // NHWC->NCWH
+            std::swap(dims[2], dims[3]);  // NCWH->NCHW
+            if (dims[0] == -1)  // It's OK to have undetermined batch size
+                dims[0] = 1;
+        }
+        bool hasNeg = false;
+        for (int i = 0; i < dims.size() && !hasNeg; ++i)
+        {
+            hasNeg = dims[i] < 0;
+        }
+        if (!hasNeg)
+            netInputShapes.push_back(dims);
+    }
+}
 
-            int kernelTensorInpId = -1;
-            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernelTensorInpId);
-            const String kernelTensorName = layer.input(kernelTensorInpId);
-            std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
-            if (sharedWeightsIt == sharedWeights.end())
-            {
-                kernelFromTensor(kernelTensor, layerParams.blobs[0]);
-                releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
-
-                int* kshape = layerParams.blobs[0].size.p;
-                const int outCh = kshape[0];
-                const int inCh = kshape[1];
-                const int height = kshape[2];
-                const int width = kshape[3];
-                if (type == "DepthwiseConv2dNative")
-                {
-                    CV_Assert(!locPredTransposed);
-                    const int chMultiplier = kshape[0];
-
-                    Mat copy = layerParams.blobs[0].clone();
-                    float* src = (float*)copy.data;
-                    float* dst = (float*)layerParams.blobs[0].data;
-                    for (int i = 0; i < chMultiplier; ++i)
-                        for (int j = 0; j < inCh; ++j)
-                            for (int s = 0; s < height * width; ++s)
-                                {
-                                    int src_i = (i * inCh + j) * height * width + s;
-                                    int dst_i = (j * chMultiplier + i) * height* width + s;
-                                    dst[dst_i] = src[src_i];
-                                }
-                    // TODO Use reshape instead
-                    kshape[0] = inCh * chMultiplier;
-                    kshape[1] = 1;
-                    size_t* kstep = layerParams.blobs[0].step.p;
-                    kstep[0] = kstep[1]; // fix steps too
-                }
+void TFImporter::parseSplit(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // TODO: determining axis index remapping by input dimensions order of input blob
+    // TODO: slicing input may be Const op
+    // TODO: slicing kernels for convolutions - in current implementation it is impossible
+    // TODO: add parsing num of slices parameter
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 2, "");
+    // num_split
+    // 1st blob is dims tensor
+    int axis = getConstBlob(layer, value_id, 0).int_val().Get(0);
+    if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+        axis = toNCHW(axis);
+    layerParams.set("axis", axis);
+
+    if (hasLayerAttr(layer, "num_split"))
+        layerParams.set("num_split", getLayerAttr(layer, "num_split").i());
+
+    int id = dstNet.addLayer(name, "Slice", layerParams);
+    layer_id[name] = id;
+
+    // one input only
+    connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
+}
 
-                // Shuffle output channels from yxYX to xyXY.
-                if (locPredTransposed)
-                {
-                    const int slice = height * width * inCh;
-                    for (int i = 0; i < outCh; i += 2)
-                    {
-                        cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i));
-                        cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr<float>(i + 1));
-                        std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
-                    }
-                }
-                sharedWeights[kernelTensorName] = layerParams.blobs[0];
-            }
-            else
-            {
-                layerParams.blobs[0] = sharedWeightsIt->second;
-            }
-            Mat weights = layerParams.blobs[0];
-            layerParams.set("kernel_size",  DictValue::arrayInt(&weights.size[2], weights.dims - 2));
+void TFImporter::parseSlice(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "Slice"
+    // input: "input_node"
+    // input: "Slice/begin"
+    // input: "Slice/size"
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 3, "");
+    Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
+    Mat sizes = getTensorContent(getConstBlob(layer, value_id, 2));
+    CV_Assert_N(!begins.empty(), !sizes.empty());
+    CV_CheckTypeEQ(begins.type(), CV_32SC1, "");
+    CV_CheckTypeEQ(sizes.type(), CV_32SC1, "");
+
+    if (begins.total() == 4 && getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+    {
+        // Swap NHWC parameters' order to NCHW.
+        std::swap(*begins.ptr<int32_t>(0, 2), *begins.ptr<int32_t>(0, 3));
+        std::swap(*begins.ptr<int32_t>(0, 1), *begins.ptr<int32_t>(0, 2));
+        std::swap(*sizes.ptr<int32_t>(0, 2), *sizes.ptr<int32_t>(0, 3));
+        std::swap(*sizes.ptr<int32_t>(0, 1), *sizes.ptr<int32_t>(0, 2));
+    }
+    layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total()));
+    layerParams.set("size", DictValue::arrayInt((int*)sizes.data, sizes.total()));
 
-            layerParams.set("num_output", layerParams.blobs[0].size[0]);
+    int id = dstNet.addLayer(name, "Slice", layerParams);
+    layer_id[name] = id;
 
-            setStrides(layerParams, layer);
-            if (!layerParams.has("pad_w") && !layerParams.has("pad_h"))
-                setPadding(layerParams, layer);
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+}
 
-            // The final node of dilated convolution subgraph.
-            next_layers = getNextLayers(net, name, "BatchToSpaceND");
-            if (!next_layers.empty())
-            {
-                CV_Assert(next_layers.size() == 1);
-                ExcludeLayer(net, next_layers[0].second, 0, false);
-                layers_to_ignore.insert(next_layers[0].first);
-            }
+void TFImporter::parseStridedSlice(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 4, "");
+    Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
+    Mat ends = getTensorContent(getConstBlob(layer, value_id, 2));
+    Mat strides = getTensorContent(getConstBlob(layer, value_id, 3));
+    CV_CheckTypeEQ(begins.type(), CV_32SC1, "");
+    CV_CheckTypeEQ(ends.type(), CV_32SC1, "");
+    CV_CheckTypeEQ(strides.type(), CV_32SC1, "");
+    const int num = begins.total();
+    CV_Assert_N(num == ends.total(), num == strides.total());
+
+    int end_mask = getLayerAttr(layer, "end_mask").i();
+    for (int i = 0; i < num; ++i)
+    {
+        if (ends.at<int>(i) < 0)
+            ends.at<int>(i) -= 1;
+        if (end_mask & (1 << i))
+            ends.at<int>(i) = -1;
+        if (strides.at<int>(i) != 1)
+            CV_Error(Error::StsNotImplemented,
+                     format("StridedSlice with stride %d", strides.at<int>(i)));
+    }
+    if (begins.total() == 4 && getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+    {
+        // Swap NHWC parameters' order to NCHW.
+        std::swap(begins.at<int>(2), begins.at<int>(3));
+        std::swap(begins.at<int>(1), begins.at<int>(2));
+        std::swap(ends.at<int>(2), ends.at<int>(3));
+        std::swap(ends.at<int>(1), ends.at<int>(2));
+    }
+    layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total()));
+    layerParams.set("end", DictValue::arrayInt((int*)ends.data, ends.total()));
 
-            int id = dstNet.addLayer(name, "Convolution", layerParams);
-            layer_id[name] = id;
+    int id = dstNet.addLayer(name, "Slice", layerParams);
+    layer_id[name] = id;
 
-            // one input only
-            connect(layer_id, dstNet, parsePin(input), id, 0);
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+}
 
+// "Mul" "RealDiv"
+void TFImporter::parseMul(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
 
-            if (getDataLayout(name, data_layouts) == DATA_LAYOUT_UNKNOWN)
-                data_layouts[name] = DATA_LAYOUT_NHWC;
+    CV_CheckGT(num_inputs, 0, "");
+    int constId = -1;
+    for(int ii = 0; ii < num_inputs; ++ii)
+    {
+        Pin input = parsePin(layer.input(ii));
+        if (value_id.find(input.name) != value_id.end())
+        {
+            constId = ii;
+            break;
+        }
+    }
+    CV_Assert((constId != -1) || (num_inputs == 2));
+
+    if (constId != -1)
+    {
+        // Multiplication by constant.
+        CV_CheckEQ(num_inputs, 2, "");
+        Mat scaleMat = getTensorContent(getConstBlob(layer, value_id));
+        CV_Assert(scaleMat.type() == CV_32FC1);
+        if (type == "RealDiv")
+        {
+            if (constId == 0)
+                CV_Error(Error::StsNotImplemented, "Division of constant over variable");
+            scaleMat = 1.0f / scaleMat;
         }
-        else if (type == "BiasAdd" || type == "Add" || type == "AddV2" || type == "Sub" || type=="AddN")
+
+        int id;
+        if (scaleMat.total() == 1)  // is a scalar.
         {
-            CV_CheckGT(num_inputs, 0, "");
-            bool haveConst = false;
-            for(int ii = 0; !haveConst && ii < num_inputs; ++ii)
+            // Try to match with a LeakyRelu:
+            // node {
+            //   name: "LeakyRelu/mul"
+            //   op: "Mul"
+            //   input: "LeakyRelu/alpha"
+            //   input: "input"
+            // }
+            // node {
+            //   name: "LeakyRelu/Maximum"
+            //   op: "Maximum"
+            //   input: "LeakyRelu/mul"
+            //   input: "input"
+            // }
+            StrIntVector next_layers = getNextLayers(net, name, "Maximum");
+            if (!next_layers.empty())
             {
-                Pin input = parsePin(layer.input(ii));
-                haveConst = value_id.find(input.name) != value_id.end();
-            }
-            CV_Assert(!haveConst || num_inputs == 2);
+                int maximumLayerIdx = next_layers[0].second;
 
-            if (haveConst)
-            {
-                Mat values = getTensorContent(getConstBlob(layer, value_id));
-                CV_Assert(values.type() == CV_32FC1);
-                if (type == "Sub")
-                    values *= -1.0f;
+                CV_Assert(net.node(maximumLayerIdx).input_size() == 2);
 
-                int id;
-                if (values.total() == 1)  // is a scalar.
-                {
-                    layerParams.set("shift", values.at<float>(0));
-                    id = dstNet.addLayer(name, "Power", layerParams);
-                }
-                else  // is a vector
-                {
-                    layerParams.blobs.resize(1, values);
-                    id = dstNet.addLayer(name, "Shift", layerParams);
-                }
-                layer_id[name] = id;
+                // The input from the Mul layer can also be at index 1.
+                int mulInputIdx = (net.node(maximumLayerIdx).input(0) == name) ? 0 : 1;
+
+                ExcludeLayer(net, maximumLayerIdx, mulInputIdx, false);
+                layers_to_ignore.insert(next_layers[0].first);
 
-                // one input only
-                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+                layerParams.set("negative_slope", scaleMat.at<float>(0));
+                id = dstNet.addLayer(name, "ReLU", layerParams);
             }
             else
             {
-                layerParams.set("operation", "sum");
-                if (type == "Sub")
-                {
-                    static float subCoeffs[] = {1.f, -1.f};
-                    layerParams.set("coeff", DictValue::arrayReal<float*>(subCoeffs, 2));
-                }
-
-                int id = dstNet.addLayer(name, "Eltwise", layerParams);
-                layer_id[name] = id;
-
-                for (int ii = 0; ii < num_inputs; ii++)
-                {
-                    Pin inp = parsePin(layer.input(ii));
-                    if (layer_id.find(inp.name) == layer_id.end())
-                        CV_Error(Error::StsError, "Input layer not found: " + inp.name);
-                    connect(layer_id, dstNet, inp, id, ii);
-                }
+                // Just a multiplication.
+                layerParams.set("scale", scaleMat.at<float>(0));
+                id = dstNet.addLayer(name, "Power", layerParams);
             }
         }
-        else if (type == "MatMul")
+        else  // is a vector
         {
-            CV_CheckEQ(num_inputs, 2, "");
-
-            // For the object detection networks, TensorFlow Object Detection API
-            // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax)
-            // order. We can manage it at DetectionOutput layer parsing predictions
-            // or shuffle last Faster-RCNN's matmul weights.
-            bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") &&
-                                     getLayerAttr(layer, "loc_pred_transposed").b();
+            layerParams.blobs.resize(1, scaleMat);
 
-            layerParams.set("bias_term", false);
-            layerParams.blobs.resize(1);
-
-            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");  // FIXIT Use layers fusion instead
-            if (next_layers.empty())
+            StrIntVector next_layers = getNextLayers(net, name, "Add");
+            if (!next_layers.empty())
             {
-                next_layers = getNextLayers(net, name, "Add");
-            }
-            if (next_layers.size() == 1) {
                 layerParams.set("bias_term", true);
                 layerParams.blobs.resize(2);
 
                 int weights_layer_index = next_layers[0].second;
-                blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
+                blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs.back());
                 ExcludeLayer(net, weights_layer_index, 0, false);
                 layers_to_ignore.insert(next_layers[0].first);
-
-                if (locPredTransposed)
-                {
-                    const int numWeights = layerParams.blobs[1].total();
-                    float* biasData = reinterpret_cast<float*>(layerParams.blobs[1].data);
-                    CV_Assert(numWeights % 4 == 0);
-                    for (int i = 0; i < numWeights; i += 2)
-                    {
-                        std::swap(biasData[i], biasData[i + 1]);
-                    }
-                }
-            }
-
-            int kernel_blob_index = -1;
-            const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id, -1, &kernel_blob_index);
-            const String kernelTensorName = layer.input(kernel_blob_index);
-            std::map<String, Mat>::iterator sharedWeightsIt = sharedWeights.find(kernelTensorName);
-            if (sharedWeightsIt == sharedWeights.end())
-            {
-                blobFromTensor(kernelTensor, layerParams.blobs[0]);
-                releaseTensor(const_cast<tensorflow::TensorProto*>(&kernelTensor));
-                sharedWeights[kernelTensorName] = layerParams.blobs[0];
-            }
-            else
-            {
-                layerParams.blobs[0] = sharedWeightsIt->second;
-            }
-
-            if (kernel_blob_index == 1) { // In this case output is computed by x*W formula - W should be transposed
-                Mat data = layerParams.blobs[0].t();
-                layerParams.blobs[0] = data.clone();
-            }
-
-            layerParams.set("num_output", layerParams.blobs[0].size[0]);
-            if (locPredTransposed)
-            {
-                CV_Assert(layerParams.blobs[0].dims == 2);
-                for (int i = 0; i < layerParams.blobs[0].size[0]; i += 2)
-                {
-                    cv::Mat src = layerParams.blobs[0].row(i);
-                    cv::Mat dst = layerParams.blobs[0].row(i + 1);
-                    std::swap_ranges(src.begin<float>(), src.end<float>(), dst.begin<float>());
-                }
             }
 
-            int id = dstNet.addLayer(name, "InnerProduct", layerParams);
-            layer_id[name] = id;
+            if (hasLayerAttr(layer, "axis"))
+                layerParams.set("axis", getLayerAttr(layer, "axis").i());
 
-            // one input only
-            int input_blob_index = kernel_blob_index == 0 ? 1 : 0;
-            connect(layer_id, dstNet, parsePin(layer.input(input_blob_index)), id, 0);
-            data_layouts[name] = DATA_LAYOUT_PLANAR;
+            id = dstNet.addLayer(name, "Scale", layerParams);
         }
-        else if (type == "Reshape")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            Pin inpId = parsePin(layer.input(0));
-            DataLayout inpLayout = getDataLayout(layer.input(0), data_layouts);
-            // There are two possible implementations: reshape an input using
-            // predefined sizes or use a second input blob as a source of new shape.
-            if (value_id.find(layer.input(1)) != value_id.end())
-            {
-                Mat newShape = getTensorContent(getConstBlob(layer, value_id, 1));
-                int newShapeSize = newShape.total();
-                bool hasSwap = false;
-                if (newShapeSize == 4 && hasAllOnes(newShape, 0, 2))
-                {
-                    // NHWC->NCHW
-                    std::swap(*newShape.ptr<int32_t>(0, 2), *newShape.ptr<int32_t>(0, 3));
-                    std::swap(*newShape.ptr<int32_t>(0, 1), *newShape.ptr<int32_t>(0, 2));
-                    hasSwap = true;
-                }
-                if (inpLayout == DATA_LAYOUT_NHWC)
-                {
-                    if (newShapeSize >= 2 || newShape.at<int>(1) == 1)
-                    {
-                        int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
-                        addPermuteLayer(order, name + "/nhwc", inpId);
-                        if (newShapeSize < 4)
-                        {
-                            inpLayout = DATA_LAYOUT_NCHW;
-                        }
-                        else
-                        {
-                            inpLayout = DATA_LAYOUT_NHWC;
-                        }
-                    }
-                }
-                layerParams.set("dim", DictValue::arrayInt<int*>(newShape.ptr<int>(), newShapeSize));
-
-                int id = dstNet.addLayer(name, "Reshape", layerParams);
-                layer_id[name] = id;
+        layer_id[name] = id;
 
-                // one input only
-                connect(layer_id, dstNet, inpId, id, 0);
-                inpId = Pin(name);
+        Pin inp0 = parsePin(layer.input(0));
+        if (layer_id.find(inp0.name) != layer_id.end())
+            // First operand is a constant.
+            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+        else
+            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
+    }
+    else
+    {
+        // Check if all the inputs have the same shape.
+        bool equalInpShapes = true;
+        bool isShapeOnes = false;
+        MatShape outShape0;
+        for (int ii = 0; ii < num_inputs && !netInputShapes.empty(); ii++)
+        {
+            Pin pin = parsePin(layer.input(ii));
+            int inpId = layer_id.find(pin.name)->second;
 
-                if ((inpLayout == DATA_LAYOUT_NHWC || inpLayout == DATA_LAYOUT_UNKNOWN || inpLayout == DATA_LAYOUT_PLANAR) &&
-                    newShapeSize == 4 && !hasSwap)
-                {
-                    int order[] = {0, 3, 1, 2};  // Transform back to OpenCV's NCHW.
-                    addPermuteLayer(order, name + "/nchw", inpId);
-                    inpLayout = DATA_LAYOUT_NCHW;
-                }
+            // Get input shape
+            MatShape outShape;
+            std::vector<MatShape> inpShapes, outShapes;
+            dstNet.getLayerShapes(netInputShapes, inpId, inpShapes, outShapes);
+            CV_CheckGT(static_cast<int>(outShapes.size()), pin.blobIndex, "");
+            outShape = outShapes[pin.blobIndex];
 
-                data_layouts[name] = newShapeSize == 2 ? DATA_LAYOUT_PLANAR : inpLayout;
-            }
-            else
+            if (ii == 0)
             {
-                int id = dstNet.addLayer(name, "Reshape", layerParams);
-                layer_id[name] = id;
-                connect(layer_id, dstNet, inpId, id, 0);
-                connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
-                data_layouts[name] = inpLayout;
+                outShape0 = outShape;
             }
-        }
-        else if (type == "Flatten" || type == "Squeeze")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            Pin inpId = parsePin(layer.input(0));
-            int inpLayout = getDataLayout(layer.input(0), data_layouts);
-            if (type == "Squeeze")
+            else if (outShape != outShape0)
             {
-                CV_Assert(hasLayerAttr(layer, "squeeze_dims"));
-                const tensorflow::AttrValue& dims = getLayerAttr(layer, "squeeze_dims");
-                std::vector<int> dimsVector(dims.list().i_size());
-                for (int i = 0; i < dimsVector.size(); ++i)
-                    dimsVector[i] = dims.list().i(i);
-
-                // Flatten layer can squeeze dimensions range into one.
-                std::sort(dimsVector.begin(), dimsVector.end());
-                for (int i = 1; i < dimsVector.size(); ++i)
-                {
-                    if (dimsVector[i] != dimsVector[i - 1] + 1)
-                        CV_Error(Error::StsNotImplemented, "Unsupported squeeze configuration");
-                }
-                int start = dimsVector.front() - 1, end = dimsVector.back();
-                if (start == -1 && end == 0)  // squeeze 0th dimension
-                {
-                    start = 0;
-                    end = 1;
-                }
-                layerParams.set("axis", start);
-                layerParams.set("end_axis", end);
+                equalInpShapes = false;
+                isShapeOnes = isAllOnes(outShape, 2, outShape.size()) ||
+                              isAllOnes(outShape0, 2, outShape0.size());
+                break;
             }
-            if (inpLayout == DATA_LAYOUT_NHWC)
-            {
-                LayerParams permLP;
-                int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
-                permLP.set("order", DictValue::arrayInt<int*>(order, 4));
+        }
 
-                std::string permName = name + "/nchw";
-                CV_Assert(layer_id.find(permName) == layer_id.end());
-                int permId = dstNet.addLayer(permName, "Permute", permLP);
-                layer_id[permName] = permId;
-                connect(layer_id, dstNet, inpId, permId, 0);
-                inpId = Pin(permName);
-            }
-            int id = dstNet.addLayer(name, "Flatten", layerParams);
-            layer_id[name] = id;
-            connect(layer_id, dstNet, inpId, id, 0);
-            data_layouts[name] = DATA_LAYOUT_PLANAR;
+        int id;
+        if (equalInpShapes || netInputShapes.empty() || (!equalInpShapes && isShapeOnes))
+        {
+            layerParams.set("operation", type == "RealDiv" ? "div" : "prod");
+            id = dstNet.addLayer(name, "Eltwise", layerParams);
         }
-        else if (type == "Transpose")
+        else
         {
-            CV_CheckGT(num_inputs, 0, "");
-            Mat perm = getTensorContent(getConstBlob(layer, value_id, 1));
-            CV_Assert(perm.type() == CV_32SC1);
-            int* permData = (int*)perm.data;
-            if (perm.total() == 4)
-            {
-                // Only NHWC <-> NCHW permutations are allowed. OpenCV is always
-                // keep NCHW layout this way.
-                int inpLayout = getDataLayout(layer.input(0), data_layouts);
-                std::string type = "Identity";
-                if (inpLayout == DATA_LAYOUT_NHWC)
-                {
-                    if (permData[0] == 0 && permData[1] == 3 && permData[2] == 1 && permData[3] == 2)
-                    {
-                        // in TensorFlow: NHWC->NCHW
-                        // in OpenCV: NCHW->NCHW
-                        data_layouts[name] = DATA_LAYOUT_NCHW;
-                    }
-                    else if (permData[0] == 0 && permData[1] == 1 && permData[2] == 2 && permData[3] == 3)
-                    {
-                        // in TensorFlow: NHWC->NHWC
-                        // in OpenCV: NCHW->NCHW
-                        data_layouts[name] = DATA_LAYOUT_NHWC;
-                    }
-                    else if (permData[0] == 0 && permData[1] == 3 && permData[2] == 2 && permData[3] == 1)
-                    {
-                        // in TensorFlow: NHWC->NCWH
-                        // in OpenCV: NCHW->NCWH
-                        int permData[] = {0, 1, 3, 2};
-                        layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
-                        data_layouts[name] = DATA_LAYOUT_NCHW;  // we keep track NCHW because channels position only matters
-                        type = "Permute";
-                    }
-                    else
-                        CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
-                }
-                else if (inpLayout == DATA_LAYOUT_NCHW)
-                {
-                    if (permData[0] == 0 && permData[1] == 2 && permData[2] == 3 && permData[3] == 1)
-                    {
-                        // in TensorFlow: NCHW->NHWC
-                        // in OpenCV: NCHW->NCHW
-                        data_layouts[name] = DATA_LAYOUT_NHWC;
-                    }
-                    else if (permData[0] == 0 && permData[1] == 1 && permData[2] == 2 && permData[3] == 3)
-                    {
-                        // in TensorFlow: NCHW->NCHW
-                        // in OpenCV: NCHW->NCHW
-                        data_layouts[name] = DATA_LAYOUT_NCHW;
-                    }
-                    else
-                        CV_Error(Error::StsParseError, "Only NHWC <-> NCHW permutations are allowed.");
-                }
-                int id = dstNet.addLayer(name, type, layerParams);
-                layer_id[name] = id;
-                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-            }
-            else
-            {
-                layerParams.set("order", DictValue::arrayInt<int*>(permData, perm.total()));
+            if (type == "RealDiv")
+                CV_Error(Error::StsNotImplemented, "Division of non equal tensors");
+            id = dstNet.addLayer(name, "Scale", layerParams);
+        }
 
-                int id = dstNet.addLayer(name, "Permute", layerParams);
-                layer_id[name] = id;
+        layer_id[name] = id;
 
-                // one input only
-                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-                data_layouts[name] = DATA_LAYOUT_UNKNOWN;
-            }
-        }
-        else if (type == "Const")
+        for (int ii = 0; ii < num_inputs; ii++)
         {
+            Pin inp = parsePin(layer.input(ii));
+            if (layer_id.find(inp.name) == layer_id.end())
+                CV_Error(Error::StsError, "Input layer not found: " + inp.name);
+            connect(layer_id, dstNet, inp, id, ii);
         }
-        else if (type == "LRN")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            if(hasLayerAttr(layer, "alpha")) {
-                layerParams.set("alpha", getLayerAttr(layer, "alpha").f());
-            }
-            if(hasLayerAttr(layer, "beta")) {
-                layerParams.set("beta", getLayerAttr(layer, "beta").f());
-            }
-            if(hasLayerAttr(layer, "depth_radius")) {
-                int radius = (int)getLayerAttr(layer, "depth_radius").i();
-                layerParams.set("local_size", 2*radius + 1);
-            }
-            if(hasLayerAttr(layer, "bias")) {
-                layerParams.set("bias", getLayerAttr(layer, "bias").f());
-            }
-            layerParams.set("norm_by_size", false);
+    }
+}
 
-            int id = dstNet.addLayer(name, "LRN", layerParams);
-            layer_id[name] = id;
+// "FusedBatchNorm" "FusedBatchNormV3"
+void TFImporter::parseFusedBatchNorm(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "FusedBatchNorm"
+    // input: "input"
+    // input: "BatchNorm/gamma"
+    // input: "BatchNorm/beta"
+    // input: "BatchNorm/moving_mean"
+    // input: "BatchNorm/moving_variance"
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
-        }
-        else if (type == "Concat" || type == "ConcatV2")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            int axisId = (type == "Concat" ? 0 : num_inputs - 1);
-            int axis = getConstBlob(layer, value_id, axisId).int_val().Get(0);
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-            if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
-                axis = toNCHW(axis);
-            else if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NDHWC)
-                axis = toNCDHW(axis);
-            layerParams.set("axis", axis);
+    CV_CheckEQ(num_inputs, 5, "Expected gamma, beta, mean and std");
+    Pin inpId = parsePin(layer.input(0));
 
-            // input(0) or input(n-1) is concat_dim
-            int from = (type == "Concat" ? 1 : 0);
-            int to = (type == "Concat" ? num_inputs : num_inputs - 1);
+    bool isTraining = hasLayerAttr(layer, "is_training") && getLayerAttr(layer, "is_training").b();
 
-            for (int ii = from; ii < to; ii++)
-            {
-                Pin inp = parsePin(layer.input(ii));
-                if (layer_id.find(inp.name) == layer_id.end())
-                {
-                    // There are constant inputs.
-                    LayerParams lp;
-                    lp.name = inp.name;
-                    lp.type = "Const";
-                    lp.blobs.resize(1);
-                    blobFromTensor(getConstBlob(layer, value_id, ii), lp.blobs.back());
-                    CV_Assert_N(!lp.blobs[0].empty(), lp.blobs[0].type() == CV_32F);
-
-                    int constInpId = dstNet.addLayer(lp.name, lp.type, lp);
-                    layer_id[lp.name] = constInpId;
-                }
-            }
+    layerParams.blobs.resize(2);
 
-            int id = dstNet.addLayer(name, "Concat", layerParams);
-            layer_id[name] = id;
+    const tensorflow::TensorProto& gammaTensor = getConstBlob(layer, value_id, 1);
+    if (!gammaTensor.tensor_content().empty())
+    {
+        layerParams.blobs.resize(layerParams.blobs.size() + 1);
+        layerParams.set("has_weight", true);
+        blobFromTensor(gammaTensor, layerParams.blobs.back());
+    }
+    else
+        layerParams.set("has_weight", false);
 
-            for (int ii = from; ii < to; ii++)
-            {
-                Pin inp = parsePin(layer.input(ii));
-                if (layer_id.find(inp.name) == layer_id.end())
-                    CV_Error(Error::StsError, "Input layer not found: " + inp.name);
-                connect(layer_id, dstNet, inp, id, ii - from);
-            }
-        }
-        else if (type == "MaxPool" || type == "MaxPool3D")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            layerParams.set("pool", "max");
+    const tensorflow::TensorProto& betaTensor = getConstBlob(layer, value_id, 2);
+    if (!betaTensor.tensor_content().empty())
+    {
+        layerParams.blobs.resize(layerParams.blobs.size() + 1);
+        layerParams.set("has_bias", true);
+        blobFromTensor(betaTensor, layerParams.blobs.back());
+    }
+    else
+        layerParams.set("has_bias", false);
 
-            setKSize(layerParams, layer);
-            setStrides(layerParams, layer);
-            setPadding(layerParams, layer);
-            // Test_TensorFlow_nets.EAST_text_detection/1, NGRAPH/CPU
-            layerParams.set("ceil_mode", false);
+    Mat mean, std;
+    if (isTraining)
+    {
+        if (layerParams.blobs.size() == 2)
+            CV_Error(Error::StsNotImplemented, "Cannot determine number "
+                                               "of parameters for batch normalization layer.");
+        mean = Mat::zeros(1, layerParams.blobs[2].total(), CV_32F);
+        std = Mat::ones(1, layerParams.blobs[2].total(), CV_32F);
+
+        // Add an extra layer: Mean-Variance normalization
+        LayerParams mvnParams;
+        std::string mvnName = name + "/MVN";
+        CV_Assert(layer_id.find(mvnName) == layer_id.end());
+        int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
+        layer_id[mvnName] = mvnId;
+        connect(layer_id, dstNet, inpId, mvnId, 0);
+        inpId = Pin(mvnName);
+    }
+    else
+    {
+        blobFromTensor(getConstBlob(layer, value_id, 3), mean);
+        blobFromTensor(getConstBlob(layer, value_id, 4), std);
+    }
+    layerParams.blobs[0] = mean;
+    layerParams.blobs[1] = std;
 
-            int id = dstNet.addLayer(name, "Pooling", layerParams);
-            layer_id[name] = id;
+    if (hasLayerAttr(layer, "epsilon"))
+        layerParams.set("eps", getLayerAttr(layer, "epsilon").f());
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
-        }
-        else if (type == "AvgPool" || type == "AvgPool3D")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            layerParams.set("pool", "ave");
-            layerParams.set("ave_pool_padded_area", false);
-            setKSize(layerParams, layer);
-            setStrides(layerParams, layer);
-            setPadding(layerParams, layer);
+    int id = dstNet.addLayer(name, "BatchNorm", layerParams);
+    layer_id[name] = id;
 
-            int id = dstNet.addLayer(name, "Pooling", layerParams);
-            layer_id[name] = id;
+    // one input only
+    connect(layer_id, dstNet, inpId, id, 0);
+}
 
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
-        }
-        else if (type == "MaxPoolGrad")
-        {
-            CV_CheckEQ(num_inputs, 3, "");
+void TFImporter::parseConv2DBackpropInput(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "Conv2DBackpropInput"
+    // input: "conv2d_transpose/output_shape"
+    // input: "weights"
+    // input: "input"
 
-            layerParams.set("pool_k_h", 0);
-            layerParams.set("pool_k_w", 0);
-            layerParams.set("pool_stride_h", 0);
-            layerParams.set("pool_stride_w", 0);
-            layerParams.set("pool_pad_h", 0);
-            layerParams.set("pool_pad_w", 0);
+    std::string name = layer.name();
+    const int num_inputs = layer.input_size();
 
-            int id = dstNet.addLayer(name, "MaxUnpool", layerParams);
-            layer_id[name] = id;
+    CV_CheckEQ(num_inputs, 3, "Expected output shape, weights and input nodes");
 
-            connect(layer_id, dstNet, parsePin(layer.input(2)), id, 0);
-            connect(layer_id, dstNet, parsePin(layer.input(1) + ":1"), id, 1);
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 2);
-        }
-        else if (type == "Placeholder")
+    layerParams.set("bias_term", false);
+    layerParams.blobs.resize(1);
+
+    StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");
+    if (next_layers.size() == 1)
+    {
+        layerParams.set("bias_term", true);
+        layerParams.blobs.resize(2);
+
+        int weights_layer_index = next_layers[0].second;
+
+        blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
+        ExcludeLayer(net, weights_layer_index, 0, false);
+        layers_to_ignore.insert(next_layers[0].first);
+    }
+
+    kernelFromTensor(getConstBlob(layer, value_id, 1), layerParams.blobs[0]);
+
+    const int* kshape = layerParams.blobs[0].size.p;
+    const int kernelH = kshape[2];
+    const int kernelW = kshape[3];
+    layerParams.set("kernel_h", kernelH);
+    layerParams.set("kernel_w", kernelW);
+    layerParams.set("num_output", kshape[1]);
+
+    setStrides(layerParams, layer);
+    setPadMode(layerParams, layer);
+    int64_t pads[8];
+    bool explicit_pads = getExplicitPadding(layerParams, layer, pads);
+    int64_t begs[4] = {};
+    int64_t ends[4] = {-1, -1, -1, -1};
+    if (explicit_pads)
+    {
+        name += "/deconv";
+        layerParams.set("pad_mode", "VALID");
+        for (int i = 2; i < 4; ++i) // begins=[0, 0, a, b], ends=[-1, -1, c, d]
         {
-            if (!hasLayerAttr(layer, "dtype") ||
-                getLayerAttr(layer, "dtype").type() != tensorflow::DT_BOOL)  // If input is not a train/test flag.
-            {
-                netInputsNames.push_back(name);
-                layer_id[name] = 0;
-            }
-            tensorflow::TensorShapeProto shape;
-            if (hasLayerAttr(layer, "shape"))
-                shape = getLayerAttr(layer, "shape").shape();
-            else if (hasLayerAttr(layer, "_output_shapes"))
-            {
-                tensorflow::AttrValue_ListValue list = getLayerAttr(layer, "_output_shapes").list();
-                if (list.shape_size())
-                    shape = list.shape()[0];
-            }
-            if (shape.dim_size())
-            {
-                MatShape dims(shape.dim_size());
-                for (int i = 0; i < dims.size(); ++i)
-                    dims[i] = shape.dim(i).size();
-                if (dims.size() == 4 && predictedLayout == DATA_LAYOUT_NHWC)
-                {
-                    std::swap(dims[1], dims[3]);  // NHWC->NCWH
-                    std::swap(dims[2], dims[3]);  // NCWH->NCHW
-                    if (dims[0] == -1)  // It's OK to have undetermined batch size
-                        dims[0] = 1;
-                }
-                bool hasNeg = false;
-                for (int i = 0; i < dims.size() && !hasNeg; ++i)
-                {
-                    hasNeg = dims[i] < 0;
-                }
-                if (!hasNeg)
-                    netInputShapes.push_back(dims);
-            }
+            begs[i] = pads[2*i];
+            ends[i] = -1 - pads[2*i + 1];
         }
-        else if (type == "Split") {
-            // TODO: determining axis index remapping by input dimensions order of input blob
-            // TODO: slicing input may be Const op
-            // TODO: slicing kernels for convolutions - in current implementation it is impossible
-            // TODO: add parsing num of slices parameter
-            CV_CheckEQ(num_inputs, 2, "");
-            // num_split
-            // 1st blob is dims tensor
-            int axis = getConstBlob(layer, value_id, 0).int_val().Get(0);
-            if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
-                axis = toNCHW(axis);
-            layerParams.set("axis", axis);
-
-            if (hasLayerAttr(layer, "num_split"))
-                layerParams.set("num_split", getLayerAttr(layer, "num_split").i());
-
-            int id = dstNet.addLayer(name, "Slice", layerParams);
-            layer_id[name] = id;
+    }
 
-            // one input only
-            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
-        }
-        else if (type == "Slice")
-        {
-            // op: "Slice"
-            // input: "input_node"
-            // input: "Slice/begin"
-            // input: "Slice/size"
-            CV_CheckEQ(num_inputs, 3, "");
-            Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
-            Mat sizes = getTensorContent(getConstBlob(layer, value_id, 2));
-            CV_Assert_N(!begins.empty(), !sizes.empty());
-            CV_CheckTypeEQ(begins.type(), CV_32SC1, "");
-            CV_CheckTypeEQ(sizes.type(), CV_32SC1, "");
+    // For convolution layer, output shape computes as
+    // o = 1 + (i - k + 2*p) / s
+    // i - input size, o - output size, k - kernel size, p - pad, s - stride
+    // In TensorFlow, p == 0 is padMode == 'VALID' or p == (k - 1) / 2
+    // considering that k is odd.
+    // SAME:  o = 1 + (i - 1) / s
+    // VALID: o = 1 + i / s
+    // Deconvolution's layer output shape computes as
+    // SAME:  o = 1 + (i - 1)*s
+    // VALID: o = (i - 1)*s
+    // If output_shape differs from formulas above then adjust padding is applied.
+
+    const int strideY = layerParams.get<int>("stride_h");
+    const int strideX = layerParams.get<int>("stride_w");
+    Mat outShape = getTensorContent(getConstBlob(layer, value_id, 0));
+    int shift = (getDataLayout(layer) == DATA_LAYOUT_NCHW);
+    const int outH = outShape.at<int>(1 + shift) + begs[2] - 1 - ends[2];
+    const int outW = outShape.at<int>(2 + shift) + begs[3] - 1 - ends[3];
+    if (layerParams.get<String>("pad_mode") == "SAME")
+    {
+        layerParams.set("adj_w", (outW - 1) % strideX);
+        layerParams.set("adj_h", (outH - 1) % strideY);
+    }
+    else if (layerParams.get<String>("pad_mode") == "VALID")
+    {
+        layerParams.set("adj_w", (outW - kernelW) % strideX);
+        layerParams.set("adj_h", (outH - kernelH) % strideY);
+    }
+    int id = dstNet.addLayer(name, "Deconvolution", layerParams);
+    layer_id[name] = id;
 
-            if (begins.total() == 4 && getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
-            {
-                // Swap NHWC parameters' order to NCHW.
-                std::swap(*begins.ptr<int32_t>(0, 2), *begins.ptr<int32_t>(0, 3));
-                std::swap(*begins.ptr<int32_t>(0, 1), *begins.ptr<int32_t>(0, 2));
-                std::swap(*sizes.ptr<int32_t>(0, 2), *sizes.ptr<int32_t>(0, 3));
-                std::swap(*sizes.ptr<int32_t>(0, 1), *sizes.ptr<int32_t>(0, 2));
-            }
-            layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total()));
-            layerParams.set("size", DictValue::arrayInt((int*)sizes.data, sizes.total()));
+    // one input only
+    connect(layer_id, dstNet, parsePin(layer.input(2)), id, 0);
+    if (explicit_pads) // If we have explicit paddings, remove extra data
+    {
+        layerParams.set("begin", DictValue::arrayInt(begs, sizeof(begs) / sizeof(begs[0])));
+        layerParams.set("end", DictValue::arrayInt(ends, sizeof(ends) / sizeof(ends[0])));
 
-            int id = dstNet.addLayer(name, "Slice", layerParams);
-            layer_id[name] = id;
+        int id = dstNet.addLayer(layer.name(), "Slice", layerParams);
+        layer_id[layer.name()] = id;
 
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-        }
-        else if (type == "StridedSlice")
+        connect(layer_id, dstNet, parsePin(name), id, 0);
+    }
+}
+
+void TFImporter::parseBlockLSTM(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "BlockLSTM"
+    // input: "lstm_block_wrapper/ToInt64/x"  (ignore, number of time stamps)
+    // input: "input"
+    // input: "lstm_block_wrapper/zeros"
+    // input: "lstm_block_wrapper/zeros"
+    // input: "lstm_block_wrapper/kernel"
+    // input: "lstm_block_wrapper/w_i_diag"
+    // input: "lstm_block_wrapper/w_f_diag"
+    // input: "lstm_block_wrapper/w_o_diag"
+    // input: "lstm_block_wrapper/bias"
+
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 9, "Unexpected number of input nodes");
+
+    if (hasLayerAttr(layer, "forget_bias"))
+        layerParams.set("forget_bias", getLayerAttr(layer, "forget_bias").f());
+
+    if (hasLayerAttr(layer, "forget_bias"))
+    {
+        float cellClip = getLayerAttr(layer, "cell_clip").f();
+        // Cell clip disabled if it's negative.
+        if (cellClip >= 0)
         {
-            CV_CheckEQ(num_inputs, 4, "");
-            Mat begins = getTensorContent(getConstBlob(layer, value_id, 1));
-            Mat ends = getTensorContent(getConstBlob(layer, value_id, 2));
-            Mat strides = getTensorContent(getConstBlob(layer, value_id, 3));
-            CV_CheckTypeEQ(begins.type(), CV_32SC1, "");
-            CV_CheckTypeEQ(ends.type(), CV_32SC1, "");
-            CV_CheckTypeEQ(strides.type(), CV_32SC1, "");
-            const int num = begins.total();
-            CV_Assert_N(num == ends.total(), num == strides.total());
+            layerParams.set("use_cell_clip", true);
+            layerParams.set("cell_clip", cellClip);
+        }
+    }
 
-            int end_mask = getLayerAttr(layer, "end_mask").i();
-            for (int i = 0; i < num; ++i)
-            {
-                if (ends.at<int>(i) < 0)
-                    ends.at<int>(i) -= 1;
-                if (end_mask & (1 << i))
-                    ends.at<int>(i) = -1;
-                if (strides.at<int>(i) != 1)
-                    CV_Error(Error::StsNotImplemented,
-                             format("StridedSlice with stride %d", strides.at<int>(i)));
-            }
-            if (begins.total() == 4 && getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
-            {
-                // Swap NHWC parameters' order to NCHW.
-                std::swap(begins.at<int>(2), begins.at<int>(3));
-                std::swap(begins.at<int>(1), begins.at<int>(2));
-                std::swap(ends.at<int>(2), ends.at<int>(3));
-                std::swap(ends.at<int>(1), ends.at<int>(2));
-            }
-            layerParams.set("begin", DictValue::arrayInt((int*)begins.data, begins.total()));
-            layerParams.set("end", DictValue::arrayInt((int*)ends.data, ends.total()));
+    Mat W, Wh, Wx, b, cs_prev, h_prev;
+    blobFromTensor(getConstBlob(layer, value_id, 4), W);
+    blobFromTensor(getConstBlob(layer, value_id, 8), b);
+    blobFromTensor(getConstBlob(layer, value_id, 2), cs_prev);
+    blobFromTensor(getConstBlob(layer, value_id, 3), h_prev);
+    const int outSize = W.cols / 4;
+
+    // IGFO->IFOG
+    float* weightData = (float*)W.data;
+    for (int i = 0; i < W.rows; ++i)
+        for (int j = 0; j < outSize; ++j)
+        {
+            std::swap(weightData[i * W.cols + 1 * outSize + j],
+                      weightData[i * W.cols + 2 * outSize + j]);
+            std::swap(weightData[i * W.cols + 2 * outSize + j],
+                      weightData[i * W.cols + 3 * outSize + j]);
+        }
+    Wx = W.rowRange(0, W.rows - outSize).t();
+    Wh = W.rowRange(W.rows - outSize, W.rows).t();
 
-            int id = dstNet.addLayer(name, "Slice", layerParams);
-            layer_id[name] = id;
+    layerParams.blobs.resize(5);
+    layerParams.blobs[0] = Wh;
+    layerParams.blobs[1] = Wx;
+    layerParams.blobs[2] = b;
+    layerParams.blobs[3] = h_prev;
+    layerParams.blobs[4] = cs_prev;
 
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-        }
-        else if (type == "Mul" || type == "RealDiv")
+    if (hasLayerAttr(layer, "use_peephole"))
+    {
+        bool usePeephole = getLayerAttr(layer, "use_peephole").b();
+        if (usePeephole)
         {
-            CV_CheckGT(num_inputs, 0, "");
-            int constId = -1;
-            for(int ii = 0; ii < num_inputs; ++ii)
+            layerParams.set("use_peephole", true);
+            layerParams.blobs.resize(8);
+            for (int i = 0; i < 3; ++i)
             {
-                Pin input = parsePin(layer.input(ii));
-                if (value_id.find(input.name) != value_id.end())
-                {
-                    constId = ii;
-                    break;
-                }
+                Mat w;
+                blobFromTensor(getConstBlob(layer, value_id, 5 + i), w);
+                w = w.reshape(1, w.total());  // Single column.
+                w = Mat::diag(w);  // Make a diagonal matrix.
+                layerParams.blobs[5 + i] = w;
             }
-            CV_Assert((constId != -1) || (num_inputs == 2));
-
-            if (constId != -1)
-            {
-                // Multiplication by constant.
-                CV_CheckEQ(num_inputs, 2, "");
-                Mat scaleMat = getTensorContent(getConstBlob(layer, value_id));
-                CV_Assert(scaleMat.type() == CV_32FC1);
-                if (type == "RealDiv")
-                {
-                    if (constId == 0)
-                        CV_Error(Error::StsNotImplemented, "Division of constant over variable");
-                    scaleMat = 1.0f / scaleMat;
-                }
+        }
+    }
 
-                int id;
-                if (scaleMat.total() == 1)  // is a scalar.
-                {
-                    // Try to match with a LeakyRelu:
-                    // node {
-                    //   name: "LeakyRelu/mul"
-                    //   op: "Mul"
-                    //   input: "LeakyRelu/alpha"
-                    //   input: "input"
-                    // }
-                    // node {
-                    //   name: "LeakyRelu/Maximum"
-                    //   op: "Maximum"
-                    //   input: "LeakyRelu/mul"
-                    //   input: "input"
-                    // }
-                    StrIntVector next_layers = getNextLayers(net, name, "Maximum");
-                    if (!next_layers.empty())
-                    {
-                        int maximumLayerIdx = next_layers[0].second;
+    int id = dstNet.addLayer(name, "LSTM", layerParams);
+    layer_id[name] = id;
 
-                        CV_Assert(net.node(maximumLayerIdx).input_size() == 2);
+    // one input only
+    connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
+    data_layouts[name] = DATA_LAYOUT_UNKNOWN;
+}
 
-                        // The input from the Mul layer can also be at index 1.
-                        int mulInputIdx = (net.node(maximumLayerIdx).input(0) == name) ? 0 : 1;
-
-                        ExcludeLayer(net, maximumLayerIdx, mulInputIdx, false);
-                        layers_to_ignore.insert(next_layers[0].first);
-
-                        layerParams.set("negative_slope", scaleMat.at<float>(0));
-                        id = dstNet.addLayer(name, "ReLU", layerParams);
-                    }
-                    else
-                    {
-                        // Just a multiplication.
-                        layerParams.set("scale", scaleMat.at<float>(0));
-                        id = dstNet.addLayer(name, "Power", layerParams);
-                    }
-                }
-                else  // is a vector
-                {
-                    layerParams.blobs.resize(1, scaleMat);
-
-                   StrIntVector next_layers = getNextLayers(net, name, "Add");
-                   if (!next_layers.empty())
-                   {
-                       layerParams.set("bias_term", true);
-                       layerParams.blobs.resize(2);
-
-                       int weights_layer_index = next_layers[0].second;
-                       blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs.back());
-                       ExcludeLayer(net, weights_layer_index, 0, false);
-                       layers_to_ignore.insert(next_layers[0].first);
-                   }
-
-                    if (hasLayerAttr(layer, "axis"))
-                        layerParams.set("axis", getLayerAttr(layer, "axis").i());
-
-                    id = dstNet.addLayer(name, "Scale", layerParams);
-                }
-                layer_id[name] = id;
-
-                Pin inp0 = parsePin(layer.input(0));
-                if (layer_id.find(inp0.name) != layer_id.end())
-                    // First operand is a constant.
-                    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-                else
-                    connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
-            }
-            else
-            {
-                // Check if all the inputs have the same shape.
-                bool equalInpShapes = true;
-                bool isShapeOnes = false;
-                MatShape outShape0;
-                for (int ii = 0; ii < num_inputs && !netInputShapes.empty(); ii++)
-                {
-                    Pin pin = parsePin(layer.input(ii));
-                    int inpId = layer_id.find(pin.name)->second;
-
-                    // Get input shape
-                    MatShape outShape;
-                    std::vector<MatShape> inpShapes, outShapes;
-                    dstNet.getLayerShapes(netInputShapes, inpId, inpShapes, outShapes);
-                    CV_CheckGT(static_cast<int>(outShapes.size()), pin.blobIndex, "");
-                    outShape = outShapes[pin.blobIndex];
+// "ResizeNearestNeighbor" "ResizeBilinear" "FusedResizeAndPadConv2D"
+void TFImporter::parseResize(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer_, LayerParams& layerParams)
+{
+    tensorflow::NodeDef layer = layer_;
+    std::string name = layer.name();
+    const std::string& type = layer.op();
+    int num_inputs = layer.input_size();
 
-                    if (ii == 0)
-                    {
-                        outShape0 = outShape;
-                    }
-                    else if (outShape != outShape0)
-                    {
-                        equalInpShapes = false;
-                        isShapeOnes = isAllOnes(outShape, 2, outShape.size()) ||
-                                      isAllOnes(outShape0, 2, outShape0.size());
-                        break;
-                    }
-                }
+    CV_CheckGT(num_inputs, 0, "");
+    std::string convWeights = "";
+    if (type == "FusedResizeAndPadConv2D")
+    {
+        // input: "mul_1"
+        // input: "decoder/ResizeBilinear/size"
+        // input: "decoder/decoder_conv0/Conv2D_dummy_paddings"
+        // input: "decoder/decoder_conv0/weights"
+        CV_CheckEQ(num_inputs, 4, "Number of input for FusedResizeAndPadConv2D");
 
-                int id;
-                if (equalInpShapes || netInputShapes.empty() || (!equalInpShapes && isShapeOnes))
-                {
-                    layerParams.set("operation", type == "RealDiv" ? "div" : "prod");
-                    id = dstNet.addLayer(name, "Eltwise", layerParams);
-                }
-                else
-                {
-                    if (type == "RealDiv")
-                        CV_Error(Error::StsNotImplemented, "Division of non equal tensors");
-                    id = dstNet.addLayer(name, "Scale", layerParams);
-                }
+        Mat paddings = getTensorContent(getConstBlob(layer, value_id, 2));
+        CV_CheckEQ(countNonZero(paddings), 0, "Unsupported mode");
 
-                layer_id[name] = id;
+        convWeights = layer.input(3);
+        layer.mutable_input()->DeleteSubrange(2, 2);  // FIXIT do NOT modify input model
+        num_inputs = layer.input_size();
+        name = name + "/resize";
 
-                for (int ii = 0; ii < num_inputs; ii++)
-                {
-                    Pin inp = parsePin(layer.input(ii));
-                    if (layer_id.find(inp.name) == layer_id.end())
-                        CV_Error(Error::StsError, "Input layer not found: " + inp.name);
-                    connect(layer_id, dstNet, inp, id, ii);
-                }
-            }
-        }
-        else if (type == "FusedBatchNorm" || type == "FusedBatchNormV3")
+        if (hasLayerAttr(layer, "resize_align_corners"))
         {
-            // op: "FusedBatchNorm"
-            // input: "input"
-            // input: "BatchNorm/gamma"
-            // input: "BatchNorm/beta"
-            // input: "BatchNorm/moving_mean"
-            // input: "BatchNorm/moving_variance"
-            CV_CheckEQ(num_inputs, 5, "Expected gamma, beta, mean and std");
-            Pin inpId = parsePin(layer.input(0));
-
-            bool isTraining = hasLayerAttr(layer, "is_training") && getLayerAttr(layer, "is_training").b();
+            // FIXIT do NOT modify input model
+            layer.mutable_attr()->insert(
+                    ::google::protobuf::MapPair<std::string, tensorflow::AttrValue>("align_corners",
+                                                                                    getLayerAttr(layer, "resize_align_corners")));
+        }
+    }
+    if (num_inputs == 2)
+    {
+        Mat outSize = getTensorContent(getConstBlob(layer, value_id, 1));
+        CV_CheckTypeEQ(outSize.type(), CV_32SC1, ""); CV_CheckEQ(outSize.total(), (size_t)2, "");
+        layerParams.set("height", outSize.at<int>(0, 0));
+        layerParams.set("width", outSize.at<int>(0, 1));
+    }
+    else if (num_inputs == 3)
+    {
+        Mat factorHeight = getTensorContent(getConstBlob(layer, value_id, 1));
+        Mat factorWidth = getTensorContent(getConstBlob(layer, value_id, 2));
+        factorHeight.convertTo(factorHeight, CV_32F);
+        factorWidth.convertTo(factorWidth, CV_32F);
+        layerParams.set("zoom_factor_x", factorWidth.at<float>(0));
+        layerParams.set("zoom_factor_y", factorHeight.at<float>(0));
+    }
+    else
+        CV_Check(num_inputs, num_inputs == 2 || num_inputs == 3, "");
 
-            layerParams.blobs.resize(2);
+    if (type == "ResizeNearestNeighbor")
+        layerParams.set("interpolation", "nearest");
+    else
+        layerParams.set("interpolation", "bilinear");
 
-            const tensorflow::TensorProto& gammaTensor = getConstBlob(layer, value_id, 1);
-            if (!gammaTensor.tensor_content().empty())
-            {
-                layerParams.blobs.resize(layerParams.blobs.size() + 1);
-                layerParams.set("has_weight", true);
-                blobFromTensor(gammaTensor, layerParams.blobs.back());
-            }
-            else
-                layerParams.set("has_weight", false);
+    if (hasLayerAttr(layer, "align_corners"))
+        layerParams.set("align_corners", getLayerAttr(layer, "align_corners").b());
 
-            const tensorflow::TensorProto& betaTensor = getConstBlob(layer, value_id, 2);
-            if (!betaTensor.tensor_content().empty())
-            {
-                layerParams.blobs.resize(layerParams.blobs.size() + 1);
-                layerParams.set("has_bias", true);
-                blobFromTensor(betaTensor, layerParams.blobs.back());
-            }
-            else
-                layerParams.set("has_bias", false);
+    if (hasLayerAttr(layer, "half_pixel_centers"))
+        layerParams.set("half_pixel_centers", getLayerAttr(layer, "half_pixel_centers").b());
 
-            Mat mean, std;
-            if (isTraining)
-            {
-                if (layerParams.blobs.size() == 2)
-                    CV_Error(Error::StsNotImplemented, "Cannot determine number "
-                             "of parameters for batch normalization layer.");
-                mean = Mat::zeros(1, layerParams.blobs[2].total(), CV_32F);
-                std = Mat::ones(1, layerParams.blobs[2].total(), CV_32F);
-
-                // Add an extra layer: Mean-Variance normalization
-                LayerParams mvnParams;
-                std::string mvnName = name + "/MVN";
-                CV_Assert(layer_id.find(mvnName) == layer_id.end());
-                int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
-                layer_id[mvnName] = mvnId;
-                connect(layer_id, dstNet, inpId, mvnId, 0);
-                inpId = Pin(mvnName);
-            }
-            else
-            {
-                blobFromTensor(getConstBlob(layer, value_id, 3), mean);
-                blobFromTensor(getConstBlob(layer, value_id, 4), std);
-            }
-            layerParams.blobs[0] = mean;
-            layerParams.blobs[1] = std;
+    int id = dstNet.addLayer(name, "Resize", layerParams);
+    layer_id[name] = id;
 
-            if (hasLayerAttr(layer, "epsilon"))
-                layerParams.set("eps", getLayerAttr(layer, "epsilon").f());
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
 
-            int id = dstNet.addLayer(name, "BatchNorm", layerParams);
-            layer_id[name] = id;
+    // Step back to add convolution
+    if (type == "FusedResizeAndPadConv2D")
+    {
+        tensorflow::NodeDef conv = layer_;
+        conv.clear_input();
+        conv.add_input(name);
+        conv.add_input(convWeights);
+        conv.set_op("Conv2D");
+        parseNode(conv);
+    }
+}
 
-            // one input only
-            connect(layer_id, dstNet, inpId, id, 0);
-        }
-        else if (type == "Conv2DBackpropInput")
-        {
-            // op: "Conv2DBackpropInput"
-            // input: "conv2d_transpose/output_shape"
-            // input: "weights"
-            // input: "input"
-            CV_CheckEQ(num_inputs, 3, "Expected output shape, weights and input nodes");
+void TFImporter::parseL2Normalize(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "L2Normalize"
+    // input: "input"
+    // input: "reduction_indices" (axis)
 
-            layerParams.set("bias_term", false);
-            layerParams.blobs.resize(1);
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-            StrIntVector next_layers = getNextLayers(net, name, "BiasAdd");
-            if (next_layers.size() == 1)
-            {
-                layerParams.set("bias_term", true);
-                layerParams.blobs.resize(2);
+    CV_CheckEQ(num_inputs, 2, "");
+    Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1));
+    CV_Assert(reductionIndices.type() == CV_32SC1);
 
-                int weights_layer_index = next_layers[0].second;
+    const int numAxes = reductionIndices.total();
+    if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
+        for (int i = 0; i < numAxes; ++i)
+            reductionIndices.at<int>(i) = toNCHW(reductionIndices.at<int>(i));
 
-                blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]);
-                ExcludeLayer(net, weights_layer_index, 0, false);
-                layers_to_ignore.insert(next_layers[0].first);
-            }
+    cv::sort(reductionIndices, reductionIndices, SORT_ASCENDING);
+    for (int i = 1; i < numAxes; ++i)
+    {
+        CV_Assert(reductionIndices.at<int>(i) == reductionIndices.at<int>(i - 1) + 1);
+        // Axes have the same sign.
+        CV_Assert(reductionIndices.at<int>(i) * reductionIndices.at<int>(i - 1) >= 0);
+    }
+    layerParams.set("start_axis", reductionIndices.at<int>(0));
+    layerParams.set("end_axis", reductionIndices.at<int>(numAxes - 1));
 
-            kernelFromTensor(getConstBlob(layer, value_id, 1), layerParams.blobs[0]);
-
-            const int* kshape = layerParams.blobs[0].size.p;
-            const int kernelH = kshape[2];
-            const int kernelW = kshape[3];
-            layerParams.set("kernel_h", kernelH);
-            layerParams.set("kernel_w", kernelW);
-            layerParams.set("num_output", kshape[1]);
-
-            setStrides(layerParams, layer);
-            setPadding(layerParams, layer);
-
-            // For convolution layer, output shape computes as
-            // o = 1 + (i - k + 2*p) / s
-            // i - input size, o - output size, k - kernel size, p - pad, s - stride
-            // In TensorFlow, p == 0 is padMode == 'VALID' or p == (k - 1) / 2
-            // considering that k is odd.
-            // SAME:  o = 1 + (i - 1) / s
-            // VALID: o = 1 + i / s
-            // Deconvolution's layer output shape computes as
-            // SAME:  o = 1 + (i - 1)*s
-            // VALID: o = (i - 1)*s
-            // If output_shape differs from formulas above then adjust padding is applied.
-
-            const int strideY = layerParams.get<int>("stride_h");
-            const int strideX = layerParams.get<int>("stride_w");
-            Mat outShape = getTensorContent(getConstBlob(layer, value_id, 0));
-            const int outH = outShape.at<int>(1);
-            const int outW = outShape.at<int>(2);
-            if (layerParams.get<String>("pad_mode") == "SAME")
-            {
-                layerParams.set("adj_w", (outW - 1) % strideX);
-                layerParams.set("adj_h", (outH - 1) % strideY);
-            }
-            else if (layerParams.get<String>("pad_mode") == "VALID")
-            {
-                layerParams.set("adj_w", (outW - kernelW) % strideX);
-                layerParams.set("adj_h", (outH - kernelH) % strideY);
-            }
-            int id = dstNet.addLayer(name, "Deconvolution", layerParams);
-            layer_id[name] = id;
+    int id = dstNet.addLayer(name, "Normalize", layerParams);
+    layer_id[name] = id;
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+}
 
-            // one input only
-            connect(layer_id, dstNet, parsePin(layer.input(2)), id, 0);
-        }
-        else if (type == "BlockLSTM")
+void TFImporter::parsePriorBox(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckEQ(num_inputs, 2, "");
+    if (hasLayerAttr(layer, "min_size"))
+        layerParams.set("min_size", getLayerAttr(layer, "min_size").i());
+    if (hasLayerAttr(layer, "max_size"))
+        layerParams.set("max_size", getLayerAttr(layer, "max_size").i());
+    if (hasLayerAttr(layer, "flip"))
+        layerParams.set("flip", getLayerAttr(layer, "flip").b());
+    if (hasLayerAttr(layer, "clip"))
+        layerParams.set("clip", getLayerAttr(layer, "clip").b());
+    if (hasLayerAttr(layer, "offset"))
+        layerParams.set("offset", getLayerAttr(layer, "offset").f());
+    if (hasLayerAttr(layer, "step"))
+        layerParams.set("step", getLayerAttr(layer, "step").f());
+
+    const std::string paramNames[] = {"variance", "aspect_ratio", "scales",
+                                      "width", "height"};
+    for (int i = 0; i < 5; ++i)
+    {
+        if (hasLayerAttr(layer, paramNames[i]))
         {
-            // op: "BlockLSTM"
-            // input: "lstm_block_wrapper/ToInt64/x"  (ignore, number of time stamps)
-            // input: "input"
-            // input: "lstm_block_wrapper/zeros"      (ignore)
-            // input: "lstm_block_wrapper/zeros"      (ignore)
-            // input: "lstm_block_wrapper/kernel"
-            // input: "lstm_block_wrapper/w_i_diag"
-            // input: "lstm_block_wrapper/w_f_diag"
-            // input: "lstm_block_wrapper/w_o_diag"
-            // input: "lstm_block_wrapper/bias"
-            CV_CheckEQ(num_inputs, 9, "Unexpected number of input nodes");
-
-            if (hasLayerAttr(layer, "forget_bias"))
-                layerParams.set("forget_bias", getLayerAttr(layer, "forget_bias").f());
+            Mat values = getTensorContent(getLayerAttr(layer, paramNames[i]).tensor());
+            layerParams.set(paramNames[i],
+                            DictValue::arrayReal<float*>((float*)values.data, values.total()));
+        }
+    }
+    int id = dstNet.addLayer(name, "PriorBox", layerParams);
+    layer_id[name] = id;
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+    connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
+    data_layouts[name] = DATA_LAYOUT_UNKNOWN;
+}
 
-            if (hasLayerAttr(layer, "forget_bias"))
-            {
-                float cellClip = getLayerAttr(layer, "cell_clip").f();
-                // Cell clip disabled if it's negative.
-                if (cellClip >= 0)
-                {
-                    layerParams.set("use_cell_clip", true);
-                    layerParams.set("cell_clip", cellClip);
-                }
-            }
+void TFImporter::parseSoftmax(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-            Mat W, Wh, Wx, b;
-            blobFromTensor(getConstBlob(layer, value_id, 4), W);
-            blobFromTensor(getConstBlob(layer, value_id, 8), b);
-            const int outSize = W.cols / 4;
+    CV_CheckGT(num_inputs, 0, "");
+    if (hasLayerAttr(layer, "axis"))
+        layerParams.set("axis", getLayerAttr(layer, "axis").i());
 
-            // IGFO->IFOG
-            float* weightData = (float*)W.data;
-            for (int i = 0; i < W.rows; ++i)
-                for (int j = 0; j < outSize; ++j)
-                {
-                    std::swap(weightData[i * W.cols + 1 * outSize + j],
-                              weightData[i * W.cols + 2 * outSize + j]);
-                    std::swap(weightData[i * W.cols + 2 * outSize + j],
-                              weightData[i * W.cols + 3 * outSize + j]);
-                }
-            Wx = W.rowRange(0, W.rows - outSize).t();
-            Wh = W.rowRange(W.rows - outSize, W.rows).t();
+    int id = dstNet.addLayer(name, "Softmax", layerParams);
+    layer_id[name] = id;
+    connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
+}
 
-            layerParams.blobs.resize(3);
-            layerParams.blobs[0] = Wh;
-            layerParams.blobs[1] = Wx;
-            layerParams.blobs[2] = b;
+void TFImporter::parseCropAndResize(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "CropAndResize"
+    // input: "input"
+    // input: "boxes"
+    // input: "sizes"
 
-            if (hasLayerAttr(layer, "use_peephole"))
-            {
-                bool usePeephole = getLayerAttr(layer, "use_peephole").b();
-                if (usePeephole)
-                {
-                    layerParams.set("use_peephole", true);
-                    layerParams.blobs.resize(6);
-                    for (int i = 0; i < 3; ++i)
-                    {
-                        Mat w;
-                        blobFromTensor(getConstBlob(layer, value_id, 5 + i), w);
-                        w = w.reshape(1, w.total());  // Single column.
-                        w = Mat::diag(w);  // Make a diagonal matrix.
-                        layerParams.blobs[3 + i] = w;
-                    }
-                }
-            }
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+    CV_CheckEQ(num_inputs, 3, "");
 
-            int id = dstNet.addLayer(name, "LSTM", layerParams);
-            layer_id[name] = id;
+    Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2));
+    CV_CheckTypeEQ(cropSize.type(), CV_32SC1, ""); CV_CheckEQ(cropSize.total(), (size_t)2, "");
 
-            // one input only
-            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 0);
-            data_layouts[name] = DATA_LAYOUT_UNKNOWN;
-        }
-        else if (type == "ResizeNearestNeighbor" || type == "ResizeBilinear" || type == "FusedResizeAndPadConv2D")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            std::string convWeights = "";
-            if (type == "FusedResizeAndPadConv2D")
-            {
-                // input: "mul_1"
-                // input: "decoder/ResizeBilinear/size"
-                // input: "decoder/decoder_conv0/Conv2D_dummy_paddings"
-                // input: "decoder/decoder_conv0/weights"
-                CV_CheckEQ(num_inputs, 4, "Number of input for FusedResizeAndPadConv2D");
+    layerParams.set("height", cropSize.at<int>(0));
+    layerParams.set("width", cropSize.at<int>(1));
 
-                Mat paddings = getTensorContent(getConstBlob(layer, value_id, 2));
-                CV_CheckEQ(countNonZero(paddings), 0, "Unsupported mode");
+    int id = dstNet.addLayer(name, "CropAndResize", layerParams);
+    layer_id[name] = id;
 
-                convWeights = layer.input(3);
-                layer.mutable_input()->DeleteSubrange(2, 2);  // FIXIT do NOT modify input model
-                num_inputs = layer.input_size();
-                name = name + "/resize";
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+    connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
+}
 
-                if (hasLayerAttr(layer, "resize_align_corners"))
-                {
-                    // FIXIT do NOT modify input model
-                    layer.mutable_attr()->insert(
-                        ::google::protobuf::MapPair<std::string, tensorflow::AttrValue>("align_corners",
-                                                                                        getLayerAttr(layer, "resize_align_corners")));
-                }
-            }
-            if (num_inputs == 2)
-            {
-                Mat outSize = getTensorContent(getConstBlob(layer, value_id, 1));
-                CV_CheckTypeEQ(outSize.type(), CV_32SC1, ""); CV_CheckEQ(outSize.total(), (size_t)2, "");
-                layerParams.set("height", outSize.at<int>(0, 0));
-                layerParams.set("width", outSize.at<int>(0, 1));
-            }
-            else if (num_inputs == 3)
-            {
-                Mat factorHeight = getTensorContent(getConstBlob(layer, value_id, 1));
-                Mat factorWidth = getTensorContent(getConstBlob(layer, value_id, 2));
-                factorHeight.convertTo(factorHeight, CV_32F);
-                factorWidth.convertTo(factorWidth, CV_32F);
-                layerParams.set("zoom_factor_x", factorWidth.at<float>(0));
-                layerParams.set("zoom_factor_y", factorHeight.at<float>(0));
-            }
-            else
-                CV_Check(num_inputs, num_inputs == 2 || num_inputs == 3, "");
+// "Mean" "Sum" "Max"
+void TFImporter::parseMean(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // Computes the mean of elements across dimensions of a tensor.
+    // If keepdims is false (default) reduces input_tensor along the dimensions given in axis,
+    // else the reduced dimensions are retained with length 1.
+    // if indices = [1, 2] in NHWC layout we use global pooling: NxCxHxW --Pooling--> NxCx1x1
+    // if keepdims is false we use Flatten after Pooling: out_shape = NxC
+    // if indices = [0] we use a global pooling by indices.
+    // To return correct shape, we use Reshape after Pooling. To determine input shape use Slice for input,
+    // if keepdims is false we use Flatten after Slice.
+    // Example: input_shape = NxCxHxW
+    // determine out shape: NxCxHxW --Slice--> 1xCxHxW
+    //                      out_shape = 1xCxHxW if keepDims else (1xCxHxW --Flatten--> CxHxW)
+    // global pool: NxCxHxW --Flatten--> Nx(C*H*W) --Reshape--> 1x1xNx(C*H*W) --Pooling--> 1x1x1x(C*H*W) --Reshape--> out_shape
+
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
+    std::string pool_type = cv::toLowerCase(type);
+
+    if (pool_type == "mean")
+    {
+        pool_type = "ave";
+    }
+    CV_CheckGT(num_inputs, 0, "");
 
-            if (type == "ResizeNearestNeighbor")
-                layerParams.set("interpolation", "nearest");
-            else
-                layerParams.set("interpolation", "bilinear");
+    Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
+    CV_Assert(indices.type() == CV_32SC1);
 
-            if (hasLayerAttr(layer, "align_corners"))
-                layerParams.set("align_corners", getLayerAttr(layer, "align_corners").b());
+    // There are two attributes, "keepdims" and a deprecated "keep_dims".
+    bool keepDims = false;
+    if (hasLayerAttr(layer, "keepdims"))
+        keepDims = getLayerAttr(layer, "keepdims").b();
+    else if (hasLayerAttr(layer, "keep_dims"))
+        keepDims = getLayerAttr(layer, "keep_dims").b();
 
-            if (hasLayerAttr(layer, "half_pixel_centers"))
-                layerParams.set("half_pixel_centers", getLayerAttr(layer, "half_pixel_centers").b());
+    if (indices.total() == 1 && indices.at<int>(0) == 0)
+    {
+        LayerParams flattenLp;
+        std::string flattenName = name + "/flatten";
+        CV_Assert(layer_id.find(flattenName) == layer_id.end());
+        int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
+        layer_id[flattenName] = flattenId;
+        connect(layer_id, dstNet, parsePin(layer.input(0)), flattenId, 0);
+
+        LayerParams reshapeLp;
+        std::string reshapeName = name + "/reshape";
+        CV_Assert(layer_id.find(reshapeName) == layer_id.end());
+        reshapeLp.set("axis", 0);
+        reshapeLp.set("num_axes", 1);
+        int newShape[] = {1, 1, -1};
+        reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 3));
+
+        int reshapeId = dstNet.addLayer(reshapeName, "Reshape", reshapeLp);
+        layer_id[reshapeName] = reshapeId;
+        connect(layer_id, dstNet, Pin(flattenName), reshapeId, 0);
+
+        LayerParams avgLp;
+        std::string avgName = name + "/avg";
+        CV_Assert(layer_id.find(avgName) == layer_id.end());
+        avgLp.set("pool", pool_type);
+        // pooling kernel H x 1
+        avgLp.set("global_pooling_h", true);
+        avgLp.set("kernel_w", 1);
+        int avgId = dstNet.addLayer(avgName, "Pooling", avgLp);
+        layer_id[avgName] = avgId;
+        connect(layer_id, dstNet, Pin(reshapeName), avgId, 0);
+
+        LayerParams sliceLp;
+        std::string layerShapeName = name + "/slice";
+        CV_Assert(layer_id.find(layerShapeName) == layer_id.end());
+        sliceLp.set("axis", 0);
+        int begin[] = {0};
+        int size[] = {1};
+        sliceLp.set("begin", DictValue::arrayInt(&begin[0], 1));
+        sliceLp.set("size", DictValue::arrayInt(&size[0], 1));
+        int sliceId = dstNet.addLayer(layerShapeName, "Slice", sliceLp);
+        layer_id[layerShapeName] = sliceId;
+        connect(layer_id, dstNet, Pin(layer.input(0)), sliceId, 0);
+
+        if (!keepDims)
+        {
+            LayerParams squeezeLp;
+            std::string squeezeName = name + "/squeeze";
+            CV_Assert(layer_id.find(squeezeName) == layer_id.end());
+            squeezeLp.set("axis", 0);
+            squeezeLp.set("end_axis", 1);
+            int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
+            layer_id[squeezeName] = squeezeId;
+            connect(layer_id, dstNet, Pin(layerShapeName), squeezeId, 0);
+            layerShapeName = squeezeName;
+        }
 
-            int id = dstNet.addLayer(name, "Resize", layerParams);
+        int id = dstNet.addLayer(name, "Reshape", layerParams);
+        layer_id[name] = id;
+        connect(layer_id, dstNet, Pin(avgName), id, 0);
+        connect(layer_id, dstNet, Pin(layerShapeName), id, 1);
+    } else if (indices.total() == 1) {
+        int axis = toNCHW(indices.at<int>(0));
+        if (axis == 2 || axis == 3)
+        {
+            layerParams.set("pool", pool_type);
+            layerParams.set(axis == 2 ? "kernel_w" : "kernel_h", 1);
+            layerParams.set(axis == 2 ? "global_pooling_h" : "global_pooling_w", true);
+            int id = dstNet.addLayer(name, "Pooling", layerParams);
             layer_id[name] = id;
-
             connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
 
-            // Step back to add convolution
-            if (type == "FusedResizeAndPadConv2D")
+            if (!keepDims)
             {
-                tensorflow::NodeDef conv = layer_;
-                conv.clear_input();
-                conv.add_input(name);
-                conv.add_input(convWeights);
-                conv.set_op("Conv2D");
-                parseNode(conv);
+                // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC
+                LayerParams permLP;
+                int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
+                std::string permName = name + "/nchw";
+                Pin inpId = Pin(name);
+                addPermuteLayer(order, permName, inpId);
+
+                LayerParams squeezeLp;
+                std::string squeezeName = name + "/squeeze";
+                CV_Assert(layer_id.find(squeezeName) == layer_id.end());
+                squeezeLp.set("axis", indices.at<int>(0));
+                squeezeLp.set("end_axis", indices.at<int>(0) + 1);
+                int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
+                layer_id[squeezeName] = squeezeId;
+                connect(layer_id, dstNet, Pin(permName), squeezeId, 0);
             }
         }
-        else if (type == "L2Normalize")
+        else if (axis == 1)
         {
-            // op: "L2Normalize"
-            // input: "input"
-            // input: "reduction_indices" (axis)
-            CV_CheckEQ(num_inputs, 2, "");
-            Mat reductionIndices = getTensorContent(getConstBlob(layer, value_id, 1));
-            CV_Assert(reductionIndices.type() == CV_32SC1);
+            int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
+            Pin inpId = parsePin(layer.input(0));
+            addPermuteLayer(order, name + "/nhwc", inpId);
 
-            const int numAxes = reductionIndices.total();
-            if (getDataLayout(name, data_layouts) == DATA_LAYOUT_NHWC)
-                for (int i = 0; i < numAxes; ++i)
-                    reductionIndices.at<int>(i) = toNCHW(reductionIndices.at<int>(i));
+            layerParams.set("pool", pool_type);
+            layerParams.set("kernel_h", 1);
+            layerParams.set("global_pooling_w", true);
+            int id = dstNet.addLayer(name, "Pooling", layerParams);
+            layer_id[name] = id;
+            connect(layer_id, dstNet, inpId, id, 0);
 
-            cv::sort(reductionIndices, reductionIndices, SORT_ASCENDING);
-            for (int i = 1; i < numAxes; ++i)
+            if (!keepDims)
             {
-                CV_Assert(reductionIndices.at<int>(i) == reductionIndices.at<int>(i - 1) + 1);
-                // Axes have the same sign.
-                CV_Assert(reductionIndices.at<int>(i) * reductionIndices.at<int>(i - 1) >= 0);
+                LayerParams squeezeLp;
+                std::string squeezeName = name + "/squeeze";
+                CV_Assert(layer_id.find(squeezeName) == layer_id.end());
+                int channel_id = 3; // TF NHWC layout
+                squeezeLp.set("axis", channel_id - 1);
+                squeezeLp.set("end_axis", channel_id);
+                int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
+                layer_id[squeezeName] = squeezeId;
+                connect(layer_id, dstNet, Pin(name), squeezeId, 0);
             }
-            layerParams.set("start_axis", reductionIndices.at<int>(0));
-            layerParams.set("end_axis", reductionIndices.at<int>(numAxes - 1));
-
-            int id = dstNet.addLayer(name, "Normalize", layerParams);
-            layer_id[name] = id;
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-        }
-        else if (type == "PriorBox")
-        {
-            CV_CheckEQ(num_inputs, 2, "");
-            if (hasLayerAttr(layer, "min_size"))
-                layerParams.set("min_size", getLayerAttr(layer, "min_size").i());
-            if (hasLayerAttr(layer, "max_size"))
-                layerParams.set("max_size", getLayerAttr(layer, "max_size").i());
-            if (hasLayerAttr(layer, "flip"))
-                layerParams.set("flip", getLayerAttr(layer, "flip").b());
-            if (hasLayerAttr(layer, "clip"))
-                layerParams.set("clip", getLayerAttr(layer, "clip").b());
-            if (hasLayerAttr(layer, "offset"))
-                layerParams.set("offset", getLayerAttr(layer, "offset").f());
-            if (hasLayerAttr(layer, "step"))
-                layerParams.set("step", getLayerAttr(layer, "step").f());
-
-            const std::string paramNames[] = {"variance", "aspect_ratio", "scales",
-                                              "width", "height"};
-            for (int i = 0; i < 5; ++i)
+            else
             {
-                if (hasLayerAttr(layer, paramNames[i]))
-                {
-                    Mat values = getTensorContent(getLayerAttr(layer, paramNames[i]).tensor());
-                    layerParams.set(paramNames[i],
-                                    DictValue::arrayReal<float*>((float*)values.data, values.total()));
-                }
+                int order[] = {0, 3, 1, 2};  // From NHWC to OpenCV's NCHW.
+                Pin inpId = parsePin(name);
+                addPermuteLayer(order, name + "/nchw", inpId);
             }
-            int id = dstNet.addLayer(name, "PriorBox", layerParams);
-            layer_id[name] = id;
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
-            data_layouts[name] = DATA_LAYOUT_UNKNOWN;
         }
-        else if (type == "Softmax")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            if (hasLayerAttr(layer, "axis"))
-                layerParams.set("axis", getLayerAttr(layer, "axis").i());
+    } else {
+        if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
+            CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean or reduce_sum operation.");
 
-            int id = dstNet.addLayer(name, "Softmax", layerParams);
-            layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
-        }
-        else if (type == "CropAndResize")
+        layerParams.set("pool", pool_type);
+        layerParams.set("global_pooling", true);
+        int id = dstNet.addLayer(name, "Pooling", layerParams);
+        layer_id[name] = id;
+        connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+
+        if (!keepDims)
         {
-            // op: "CropAndResize"
-            // input: "input"
-            // input: "boxes"
-            // input: "sizes"
-            CV_CheckEQ(num_inputs, 3, "");
+            LayerParams flattenLp;
+            std::string flattenName = name + "/flatten";
+            CV_Assert(layer_id.find(flattenName) == layer_id.end());
+            int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
+            layer_id[flattenName] = flattenId;
+            connect(layer_id, dstNet, Pin(name), flattenId, 0);
+        }
+    }
+}
 
-            Mat cropSize = getTensorContent(getConstBlob(layer, value_id, 2));
-            CV_CheckTypeEQ(cropSize.type(), CV_32SC1, ""); CV_CheckEQ(cropSize.total(), (size_t)2, "");
+void TFImporter::parsePack(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: tf.stack(list of tensors, axis=0)
+    // Join a list of inputs along a new axis.
+    // The "axis" specifies the index of the new axis in the dimensions of the output.
+    // Example: given a list with "N" tensors of shape (C, H, W):
+    // if axis == 0 then the output tensor will have the shape (N, C, H, W),
+    // if axis == 1 then the output tensor will have the shape (C, N, H, W).
+
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    CV_Assert(hasLayerAttr(layer, "axis"));
+    int dim = (int)getLayerAttr(layer, "axis").i();
+    if (dim != 0)
+        CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
+
+    CV_Assert(hasLayerAttr(layer, "N"));
+    int num = (int)getLayerAttr(layer, "N").i();
+    CV_CheckEQ(num_inputs, num, "");
+    std::string base_name = name + "/reshape_";
+    std::vector<int> reshape_ids;
+    for (int i = 0; i < num; i++) {
+        std::ostringstream ss;
+        ss << i;
+        std::string reshape_name = base_name + ss.str();
+        LayerParams reshapeLP;
+        reshapeLP.set("axis", dim);
+        reshapeLP.set("num_axes", 1);
+        int outShape[] = {1, -1};
+        reshapeLP.set("dim", DictValue::arrayInt(&outShape[0], 2));
+        int id = dstNet.addLayer(reshape_name, "Reshape", reshapeLP);
+        layer_id[reshape_name] = id;
+        reshape_ids.push_back(id);
+        connect(layer_id, dstNet, parsePin(layer.input(i)), id, 0);
+    }
 
-            layerParams.set("height", cropSize.at<int>(0));
-            layerParams.set("width", cropSize.at<int>(1));
+    layerParams.set("axis", dim);
+    int id = dstNet.addLayer(name, "Concat", layerParams);
+    layer_id[name] = id;
 
-            int id = dstNet.addLayer(name, "CropAndResize", layerParams);
-            layer_id[name] = id;
+    for (int li = 0; li < num; li++)
+        dstNet.connect(reshape_ids[li], 0, id, li);
+}
 
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-            connect(layer_id, dstNet, parsePin(layer.input(1)), id, 1);
-        }
-        else if (type == "Mean" || type == "Sum")
-        {
-            // Computes the mean of elements across dimensions of a tensor.
-            // If keepdims is false (default) reduces input_tensor along the dimensions given in axis,
-            // else the reduced dimensions are retained with length 1.
-            // if indices = [1, 2] in NHWC layout we use global pooling: NxCxHxW --Pooling--> NxCx1x1
-            // if keepdims is false we use Flatten after Pooling: out_shape = NxC
-            // if indices = [0] we use a global pooling by indices.
-            // To return correct shape, we use Reshape after Pooling. To determine input shape use Slice for input,
-            // if keepdims is false we use Flatten after Slice.
-            // Example: input_shape = NxCxHxW
-            // determine out shape: NxCxHxW --Slice--> 1xCxHxW
-            //                      out_shape = 1xCxHxW if keepDims else (1xCxHxW --Flatten--> CxHxW)
-            // global pool: NxCxHxW --Flatten--> Nx(C*H*W) --Reshape--> 1x1xNx(C*H*W) --Pooling--> 1x1x1x(C*H*W) --Reshape--> out_shape
-            CV_CheckGT(num_inputs, 0, "");
-
-            Mat indices = getTensorContent(getConstBlob(layer, value_id, 1));
-            CV_Assert(indices.type() == CV_32SC1);
-
-            // There are two attributes, "keepdims" and a deprecated "keep_dims".
-            bool keepDims = false;
-            if (hasLayerAttr(layer, "keepdims"))
-                keepDims = getLayerAttr(layer, "keepdims").b();
-            else if (hasLayerAttr(layer, "keep_dims"))
-                keepDims = getLayerAttr(layer, "keep_dims").b();
-
-            if (indices.total() == 1 && indices.at<int>(0) == 0)
-            {
-                LayerParams flattenLp;
-                std::string flattenName = name + "/flatten";
-                CV_Assert(layer_id.find(flattenName) == layer_id.end());
-                int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
-                layer_id[flattenName] = flattenId;
-                connect(layer_id, dstNet, parsePin(layer.input(0)), flattenId, 0);
-
-                LayerParams reshapeLp;
-                std::string reshapeName = name + "/reshape";
-                CV_Assert(layer_id.find(reshapeName) == layer_id.end());
-                reshapeLp.set("axis", 0);
-                reshapeLp.set("num_axes", 1);
-                int newShape[] = {1, 1, -1};
-                reshapeLp.set("dim", DictValue::arrayInt(&newShape[0], 3));
-
-                int reshapeId = dstNet.addLayer(reshapeName, "Reshape", reshapeLp);
-                layer_id[reshapeName] = reshapeId;
-                connect(layer_id, dstNet, Pin(flattenName), reshapeId, 0);
-
-                LayerParams avgLp;
-                std::string avgName = name + "/avg";
-                CV_Assert(layer_id.find(avgName) == layer_id.end());
-                avgLp.set("pool", type == "Mean" ? "ave" : "sum");
-                // pooling kernel H x 1
-                avgLp.set("global_pooling_h", true);
-                avgLp.set("kernel_w", 1);
-                int avgId = dstNet.addLayer(avgName, "Pooling", avgLp);
-                layer_id[avgName] = avgId;
-                connect(layer_id, dstNet, Pin(reshapeName), avgId, 0);
-
-                LayerParams sliceLp;
-                std::string layerShapeName = name + "/slice";
-                CV_Assert(layer_id.find(layerShapeName) == layer_id.end());
-                sliceLp.set("axis", 0);
-                int begin[] = {0};
-                int size[] = {1};
-                sliceLp.set("begin", DictValue::arrayInt(&begin[0], 1));
-                sliceLp.set("size", DictValue::arrayInt(&size[0], 1));
-                int sliceId = dstNet.addLayer(layerShapeName, "Slice", sliceLp);
-                layer_id[layerShapeName] = sliceId;
-                connect(layer_id, dstNet, Pin(layer.input(0)), sliceId, 0);
-
-                if (!keepDims)
-                {
-                    LayerParams squeezeLp;
-                    std::string squeezeName = name + "/squeeze";
-                    CV_Assert(layer_id.find(squeezeName) == layer_id.end());
-                    squeezeLp.set("axis", 0);
-                    squeezeLp.set("end_axis", 1);
-                    int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
-                    layer_id[squeezeName] = squeezeId;
-                    connect(layer_id, dstNet, Pin(layerShapeName), squeezeId, 0);
-                    layerShapeName = squeezeName;
-                }
+void TFImporter::parseClipByValue(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // op: "ClipByValue"
+    // input: "input"
+    // input: "mix"
+    // input: "max"
 
-                int id = dstNet.addLayer(name, "Reshape", layerParams);
-                layer_id[name] = id;
-                connect(layer_id, dstNet, Pin(avgName), id, 0);
-                connect(layer_id, dstNet, Pin(layerShapeName), id, 1);
-            } else if (indices.total() == 1) {
-                int axis = toNCHW(indices.at<int>(0));
-                if (axis == 2 || axis == 3)
-                {
-                    layerParams.set("pool", type == "Mean" ? "ave" : "sum");
-                    layerParams.set(axis == 2 ? "kernel_w" : "kernel_h", 1);
-                    layerParams.set(axis == 2 ? "global_pooling_h" : "global_pooling_w", true);
-                    int id = dstNet.addLayer(name, "Pooling", layerParams);
-                    layer_id[name] = id;
-                    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-
-                    if (!keepDims)
-                    {
-                        // To keep correct order after squeeze dims we first need to change layout from NCHW to NHWC
-                        LayerParams permLP;
-                        int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
-                        std::string permName = name + "/nchw";
-                        Pin inpId = Pin(name);
-                        addPermuteLayer(order, permName, inpId);
-
-                        LayerParams squeezeLp;
-                        std::string squeezeName = name + "/squeeze";
-                        CV_Assert(layer_id.find(squeezeName) == layer_id.end());
-                        squeezeLp.set("axis", indices.at<int>(0));
-                        squeezeLp.set("end_axis", indices.at<int>(0) + 1);
-                        int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
-                        layer_id[squeezeName] = squeezeId;
-                        connect(layer_id, dstNet, Pin(permName), squeezeId, 0);
-                    }
-                }
-                else if (axis == 1)
-                {
-                    int order[] = {0, 2, 3, 1};  // From OpenCV's NCHW to NHWC.
-                    Pin inpId = parsePin(layer.input(0));
-                    addPermuteLayer(order, name + "/nhwc", inpId);
-
-                    layerParams.set("pool", type == "Mean" ? "ave" : "sum");
-                    layerParams.set("kernel_h", 1);
-                    layerParams.set("global_pooling_w", true);
-                    int id = dstNet.addLayer(name, "Pooling", layerParams);
-                    layer_id[name] = id;
-                    connect(layer_id, dstNet, inpId, id, 0);
-
-                    if (!keepDims)
-                    {
-                        LayerParams squeezeLp;
-                        std::string squeezeName = name + "/squeeze";
-                        CV_Assert(layer_id.find(squeezeName) == layer_id.end());
-                        int channel_id = 3; // TF NHWC layout
-                        squeezeLp.set("axis", channel_id - 1);
-                        squeezeLp.set("end_axis", channel_id);
-                        int squeezeId = dstNet.addLayer(squeezeName, "Flatten", squeezeLp);
-                        layer_id[squeezeName] = squeezeId;
-                        connect(layer_id, dstNet, Pin(name), squeezeId, 0);
-                    }
-                    else
-                    {
-                        int order[] = {0, 3, 1, 2};  // From NHWC to OpenCV's NCHW.
-                        Pin inpId = parsePin(name);
-                        addPermuteLayer(order, name + "/nchw", inpId);
-                    }
-                }
-            } else {
-                if (indices.total() != 2 || indices.at<int>(0) != 1 || indices.at<int>(1) != 2)
-                    CV_Error(Error::StsNotImplemented, "Unsupported mode of reduce_mean or reduce_sum operation.");
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-                layerParams.set("pool", type == "Mean" ? "ave" : "sum");
-                layerParams.set("global_pooling", true);
-                int id = dstNet.addLayer(name, "Pooling", layerParams);
-                layer_id[name] = id;
-                connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+    CV_CheckEQ(num_inputs, 3, "");
 
-                if (!keepDims)
-                {
-                    LayerParams flattenLp;
-                    std::string flattenName = name + "/flatten";
-                    CV_Assert(layer_id.find(flattenName) == layer_id.end());
-                    int flattenId = dstNet.addLayer(flattenName, "Flatten", flattenLp);
-                    layer_id[flattenName] = flattenId;
-                    connect(layer_id, dstNet, Pin(name), flattenId, 0);
-                }
-            }
-        }
-        else if (type == "Pack")
-        {
-            // op: tf.stack(list of tensors, axis=0)
-            // Join a list of inputs along a new axis.
-            // The "axis" specifies the index of the new axis in the dimensions of the output.
-            // Example: given a list with "N" tensors of shape (C, H, W):
-            // if axis == 0 then the output tensor will have the shape (N, C, H, W),
-            // if axis == 1 then the output tensor will have the shape (C, N, H, W).
-            CV_CheckGT(num_inputs, 0, "");
-            CV_Assert(hasLayerAttr(layer, "axis"));
-            int dim = (int)getLayerAttr(layer, "axis").i();
-            if (dim != 0)
-                CV_Error(Error::StsNotImplemented, "Unsupported mode of pack operation.");
-
-            CV_Assert(hasLayerAttr(layer, "N"));
-            int num = (int)getLayerAttr(layer, "N").i();
-            CV_CheckEQ(num_inputs, num, "");
-            std::string base_name = name + "/reshape_";
-            std::vector<int> reshape_ids;
-            for (int i = 0; i < num; i++) {
-                std::ostringstream ss;
-                ss << i;
-                std::string reshape_name = base_name + ss.str();
-                LayerParams reshapeLP;
-                reshapeLP.set("axis", dim);
-                reshapeLP.set("num_axes", 1);
-                int outShape[] = {1, -1};
-                reshapeLP.set("dim", DictValue::arrayInt(&outShape[0], 2));
-                int id = dstNet.addLayer(reshape_name, "Reshape", reshapeLP);
-                layer_id[reshape_name] = id;
-                reshape_ids.push_back(id);
-                connect(layer_id, dstNet, parsePin(layer.input(i)), id, 0);
-            }
+    Mat minValue = getTensorContent(getConstBlob(layer, value_id, 1));
+    Mat maxValue = getTensorContent(getConstBlob(layer, value_id, 2));
+    CV_CheckEQ(minValue.total(), (size_t)1, ""); CV_CheckTypeEQ(minValue.type(), CV_32FC1, "");
+    CV_CheckEQ(maxValue.total(), (size_t)1, ""); CV_CheckTypeEQ(maxValue.type(), CV_32FC1, "");
 
-            layerParams.set("axis", dim);
-            int id = dstNet.addLayer(name, "Concat", layerParams);
-            layer_id[name] = id;
+    layerParams.set("min_value", minValue.at<float>(0));
+    layerParams.set("max_value", maxValue.at<float>(0));
 
-            for (int li = 0; li < num; li++)
-                dstNet.connect(reshape_ids[li], 0, id, li);
-        }
-        else if (type == "ClipByValue")
-        {
-            // op: "ClipByValue"
-            // input: "input"
-            // input: "mix"
-            // input: "max"
-            CV_CheckEQ(num_inputs, 3, "");
+    int id = dstNet.addLayer(name, "ReLU6", layerParams);
+    layer_id[name] = id;
 
-            Mat minValue = getTensorContent(getConstBlob(layer, value_id, 1));
-            Mat maxValue = getTensorContent(getConstBlob(layer, value_id, 2));
-            CV_CheckEQ(minValue.total(), (size_t)1, ""); CV_CheckTypeEQ(minValue.type(), CV_32FC1, "");
-            CV_CheckEQ(maxValue.total(), (size_t)1, ""); CV_CheckTypeEQ(maxValue.type(), CV_32FC1, "");
+    connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
+}
 
-            layerParams.set("min_value", minValue.at<float>(0));
-            layerParams.set("max_value", maxValue.at<float>(0));
+void TFImporter::parseLeakyRelu(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const int num_inputs = layer.input_size();
 
-            int id = dstNet.addLayer(name, "ReLU6", layerParams);
-            layer_id[name] = id;
+    CV_CheckGT(num_inputs, 0, "");
+    CV_Assert(hasLayerAttr(layer, "alpha"));
+    layerParams.set("negative_slope", getLayerAttr(layer, "alpha").f());
 
-            connect(layer_id, dstNet, parsePin(layer.input(0)), id, 0);
-        }
-        else if (type == "LeakyRelu")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            CV_Assert(hasLayerAttr(layer, "alpha"));
-            layerParams.set("negative_slope", getLayerAttr(layer, "alpha").f());
+    int id = dstNet.addLayer(name, "ReLU", layerParams);
+    layer_id[name] = id;
+    connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
+}
 
-            int id = dstNet.addLayer(name, "ReLU", layerParams);
-            layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
-        }
-        else if (type == "Abs" || type == "Tanh" || type == "Sigmoid" ||
-                 type == "Relu" || type == "Elu" || type == "Exp" ||
-                 type == "Identity" || type == "Relu6")
-        {
-            CV_CheckGT(num_inputs, 0, "");
-            std::string dnnType = type;
-            if (type == "Abs") dnnType = "AbsVal";
-            else if (type == "Tanh") dnnType = "TanH";
-            else if (type == "Relu") dnnType = "ReLU";
-            else if (type == "Relu6") dnnType = "ReLU6";
-            else if (type == "Elu") dnnType = "ELU";
+// "Abs" "Tanh" "Sigmoid" "Relu" "Elu" "Exp" "Identity" "Relu6"
+void TFImporter::parseActivation(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
+
+    CV_CheckGT(num_inputs, 0, "");
+    std::string dnnType = type;
+    if (type == "Abs") dnnType = "AbsVal";
+    else if (type == "Tanh") dnnType = "TanH";
+    else if (type == "Relu") dnnType = "ReLU";
+    else if (type == "Relu6") dnnType = "ReLU6";
+    else if (type == "Elu") dnnType = "ELU";
+
+    int id = dstNet.addLayer(name, dnnType, layerParams);
+    layer_id[name] = id;
+    connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
+}
 
-            int id = dstNet.addLayer(name, dnnType, layerParams);
-            layer_id[name] = id;
-            connectToAllBlobs(layer_id, dstNet, parsePin(layer.input(0)), id, num_inputs);
+void TFImporter::parseCustomLayer(tensorflow::GraphDef& net, const tensorflow::NodeDef& layer, LayerParams& layerParams)
+{
+    // Importer does not know how to map this TensorFlow's operation onto OpenCV's layer.
+    // However we create a layer with the same type and rely that user defined a custom layer.
+
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+    const int num_inputs = layer.input_size();
+
+    // All the attributes are added to LayerParams.
+    google::protobuf::Map<std::string, tensorflow::AttrValue> attr = layer.attr();
+    for (google::protobuf::Map<std::string, tensorflow::AttrValue>::const_iterator ai = attr.begin();
+         ai != attr.end(); ++ai)
+    {
+        if (ai->second.value_case() == tensorflow::AttrValue::kS)  // string
+            layerParams.set(ai->first, ai->second.s());
+        if (ai->second.value_case() == tensorflow::AttrValue::kI)  // int64
+            layerParams.set(ai->first, ai->second.i());
+        if (ai->second.value_case() == tensorflow::AttrValue::kF)  // float
+            layerParams.set(ai->first, ai->second.f());
+        if (ai->second.value_case() == tensorflow::AttrValue::kB)  // bool
+            layerParams.set(ai->first, ai->second.b());
+    }
+
+    // All the Const input nodes are added to layer's blobs.
+    std::vector<std::string> inputsNames;
+    for (int i = 0; i < num_inputs; ++i)
+    {
+        // Check if input is a Const node.
+        if (value_id.find(layer.input(i)) != value_id.end())
+        {
+            Mat blob = getTensorContent(getConstBlob(layer, value_id, i));
+            layerParams.blobs.push_back(blob);
         }
         else
-        {
-            // Importer does not know how to map this TensorFlow's operation onto OpenCV's layer.
-            // However we create a layer with the same type and rely that user defined a custom layer.
+            inputsNames.push_back(layer.input(i));
+    }
+    int id = dstNet.addLayer(name, type, layerParams);
+    layer_id[name] = id;
 
-            // All the attributes are added to LayerParams.
-            google::protobuf::Map<std::string, tensorflow::AttrValue> attr = layer.attr();
-            for (google::protobuf::Map<std::string, tensorflow::AttrValue>::const_iterator ai = attr.begin();
-                 ai != attr.end(); ++ai)
-            {
-                if (ai->second.value_case() == tensorflow::AttrValue::kS)  // string
-                    layerParams.set(ai->first, ai->second.s());
-                if (ai->second.value_case() == tensorflow::AttrValue::kI)  // int64
-                    layerParams.set(ai->first, ai->second.i());
-                if (ai->second.value_case() == tensorflow::AttrValue::kF)  // float
-                    layerParams.set(ai->first, ai->second.f());
-                if (ai->second.value_case() == tensorflow::AttrValue::kB)  // bool
-                    layerParams.set(ai->first, ai->second.b());
-            }
+    for (int i = 0; i < inputsNames.size(); ++i)
+    {
+        connect(layer_id, dstNet, parsePin(inputsNames[i]), id, i);
+    }
+}
 
-            // All the Const input nodes are added to layer's blobs.
-            std::vector<std::string> inputsNames;
-            for (int i = 0; i < num_inputs; ++i)
-            {
-                // Check if input is a Const node.
-                if (value_id.find(layer.input(i)) != value_id.end())
-                {
-                    Mat blob = getTensorContent(getConstBlob(layer, value_id, i));
-                    layerParams.blobs.push_back(blob);
-                }
-                else
-                    inputsNames.push_back(layer.input(i));
-            }
-            int id = dstNet.addLayer(name, type, layerParams);
-            layer_id[name] = id;
+TFImporter::TFImporter(Net& net, const char *model, const char *config)
+    : layerHandler(DNN_DIAGNOSTICS_RUN ?  new TFLayerHandler(this) : nullptr),
+        dstNet(net), dispatch(buildDispatchMap())
+{
+    if (model && model[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from file: " << model);
+        ReadTFNetParamsFromBinaryFileOrDie(model, &netBin);
+    }
+    if (config && config[0])
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from file: " << config);
+        ReadTFNetParamsFromTextFileOrDie(config, &netTxt);
+    }
 
-            for (int i = 0; i < inputsNames.size(); ++i)
-            {
-                connect(layer_id, dstNet, parsePin(inputsNames[i]), id, i);
-            }
-        }
+    populateNet();
+}
+
+TFImporter::TFImporter(
+        Net& net,
+        const char *dataModel, size_t lenModel,
+        const char *dataConfig, size_t lenConfig
+)
+    :  layerHandler(DNN_DIAGNOSTICS_RUN ?  new TFLayerHandler(this) : nullptr),
+       dstNet(net), dispatch(buildDispatchMap())
+{
+    if (dataModel != NULL && lenModel > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow model from memory (" << lenModel << " bytes)");
+        ReadTFNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBin);
     }
-    catch (const std::exception& e)
+    if (dataConfig != NULL && lenConfig > 0)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: processing TensorFlow config from memory (" << lenConfig << " bytes)");
+        ReadTFNetParamsFromTextBufferOrDie(dataConfig, lenConfig, &netTxt);
+    }
+    populateNet();
+}
+
+void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob)
+{
+    MatShape shape;
+    blobShapeFromTensor(tensor, shape);
+    int dims = (int)shape.size();
+
+    // TODO: other blob types
+    CV_Assert(tensor.dtype() == tensorflow::DT_FLOAT ||
+              tensor.dtype() == tensorflow::DT_HALF);
+    CV_Assert(dims == 4 || dims == 5);
+
+    int out_c, input_c, depth, height, width;
+    if (dims == 4)
+    {
+        // REORDER kernel HWIO to OIHW
+        swap(shape[0], shape[2]); // IWHO
+        swap(shape[1], shape[3]); // IOHW
+        swap(shape[0], shape[1]); // OIHW
+        depth = 1; height = shape[2]; width = shape[3];
+    }
+    else
+    {
+        // REORDER kernel DHWIO to OIDHW
+        swap(shape[0], shape[4]); // OHWID
+        swap(shape[1], shape[3]); // OIWHD
+        swap(shape[2], shape[4]); // OIDHW
+        depth = shape[2]; height = shape[3]; width = shape[4];
+    }
+    out_c = shape[0]; input_c = shape[1];
+
+    dstBlob.create(shape, CV_32F);
+
+    Mat tensorContent = getTensorContent(tensor, /*no copy*/false);
+    int size = tensorContent.total();
+    CV_Assert(size == (int)dstBlob.total());
+
+    float *dstData = dstBlob.ptr<float>();
+    const float *data = reinterpret_cast<const float*>(tensorContent.data);
+
+    int total = out_c * input_c * depth * height * width;
+    for (int i_oc = 0; i_oc < out_c; i_oc++) {
+        for (int i_ic = 0; i_ic < input_c; i_ic++) {
+            for (int i_d = 0; i_d < depth; i_d++) {
+                for (int i_h = 0; i_h < height; i_h++) {
+                    for (int i_w = 0; i_w < width; i_w++) {
+                        int dst_i = input_c * depth * height * width * i_oc +
+                                    depth * height * width * i_ic + height * width * i_d + width * i_h + i_w;
+                        int src_i = out_c * input_c * width * height * i_d +
+                                    out_c * input_c * width * i_h + out_c * input_c * i_w + out_c * i_ic + i_oc;
+                        CV_Assert(dst_i < total);
+                        CV_Assert(src_i < total);
+                       dstData[dst_i] = data[src_i];
+                   }
+                }
+            }
+        }
+    }
+}
+
+void TFImporter::connect(const std::map<String, int>& layers_name_id_map, Net& network, const Pin& outPin,
+             const int input_layer_id, const int input_blob_id)
+{
+    std::map<String, int>::const_iterator it = layers_name_id_map.find(outPin.name);
+    if (it == layers_name_id_map.end())
+        CV_Error(Error::StsError, "Input layer not found: " + outPin.name);
+
+    std::vector<String>::iterator inpNameIt = std::find(netInputsNames.begin(), netInputsNames.end(), outPin.name);
+    int blobIndex;
+    if (inpNameIt == netInputsNames.end())
+        blobIndex = outPin.blobIndex;
+    else
+        blobIndex = inpNameIt - netInputsNames.begin();
+    network.connect(it->second, blobIndex, input_layer_id, input_blob_id);
+}
+
+void TFImporter::connectToAllBlobs(const std::map<String, int>& layer_id, Net& network, const Pin& outPin,
+                     const int input_layer_id, const int input_blobs_count)
+{
+    for (int input_blob_id = 0; input_blob_id < input_blobs_count; input_blob_id++)
+        connect(layer_id, network, outPin, input_layer_id, input_blob_id);
+}
+
+const tensorflow::TensorProto& TFImporter::getConstBlob(const tensorflow::NodeDef &layer, std::map<String, int> const_layers,
+                                              int input_blob_index, int* actual_inp_blob_idx) {
+    if (input_blob_index == -1) {
+        for(int i = 0; i < layer.input_size(); i++) {
+            Pin input = parsePin(layer.input(i));
+            if (const_layers.find(input.name) != const_layers.end()) {
+                if (input_blob_index != -1)
+                    CV_Error(Error::StsError, "More than one input is Const op");
+
+                input_blob_index = i;
+            }
+        }
+    }
+
+    if (input_blob_index == -1)
+        CV_Error(Error::StsError, "Const input blob for weights not found");
+
+    Pin kernel_inp = parsePin(layer.input(input_blob_index));
+    if (const_layers.find(kernel_inp.name) == const_layers.end())
+        CV_Error(Error::StsError, "Input [" + layer.input(input_blob_index) +
+                                  "] for node [" + layer.name() + "] not found");
+    if (kernel_inp.blobIndex != 0)
+        CV_Error(Error::StsError, "Unsupported kernel input");
+
+    if(actual_inp_blob_idx) {
+        *actual_inp_blob_idx = input_blob_index;
+    }
+
+    int nodeIdx = const_layers.at(kernel_inp.name);
+    if (nodeIdx < netBin.node_size() && netBin.node(nodeIdx).name() == kernel_inp.name)
+    {
+        return netBin.node(nodeIdx).attr().at("value").tensor();
+    }
+    else
+    {
+        CV_Assert_N(nodeIdx < netTxt.node_size(),
+                    netTxt.node(nodeIdx).name() == kernel_inp.name);
+        return netTxt.node(nodeIdx).attr().at("value").tensor();
+    }
+}
+
+static void addConstNodes(tensorflow::GraphDef& net, std::map<String, int>& const_layers,
+                          std::set<String>& layers_to_ignore)
+{
+    CV_LOG_DEBUG(NULL, "DNN/TF: addConstNodes(): handling " << net.node_size() << " nodes...");
+    for (int li = 0; li < net.node_size(); li++)
+    {
+        const tensorflow::NodeDef &layer = net.node(li);
+        String name = layer.name();
+        String type = layer.op();
+
+        //CV_LOG_DEBUG(NULL, "DNN/TF: layer_id=" << li << " - '" << name << "' @ " << type);
+
+        try
+        {
+            if (type == "Dequantize")
+            {
+                // Example of Dequantize node:
+                //   name: "conv2d_1/bias"
+                //   op: "Dequantize"
+                //   input: "conv2d_1/bias_quantized_const" (tensor of dtype DT_QUINT8)
+                //   input: "conv2d_1/bias_quantized_min"
+                //   input: "conv2d_1/bias_quantized_max"
+                //   attr { key: "T" value { type: DT_QUINT8 } }   (quantized type)
+                //   attr { key: "mode" value { s: "MIN_FIRST" } } (quantization technique)
+                CV_CheckEQ(layer.input_size(), 3, "Dequantize: 3 inputs is supported only");
+                for (int i = 0; i < 3; ++i)
+                    CV_Assert(const_layers.find(layer.input(i)) != const_layers.end());
+                CV_Assert(hasLayerAttr(layer, "mode") &&
+                          getLayerAttr(layer, "mode").s() == "MIN_FIRST");
+
+                int tensorId = const_layers[layer.input(0)];
+                int minId = const_layers[layer.input(1)];
+                int maxId = const_layers[layer.input(2)];
+
+                tensorflow::TensorProto* tensor = net.mutable_node(tensorId)
+                                                    ->mutable_attr()->at("value")
+                                                     .mutable_tensor();
+                CV_CheckEQ((int)tensor->dtype(), (int)tensorflow::DT_QUINT8, "");
+
+                Mat qMin = getTensorContent(net.node(minId).attr().at("value").tensor());
+                Mat qMax = getTensorContent(net.node(maxId).attr().at("value").tensor());
+                CV_CheckEQ(qMin.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMin.type(), CV_32FC1, "");
+                CV_CheckEQ(qMax.total(), (size_t)1, "");
+                CV_CheckTypeEQ(qMax.type(), CV_32FC1, "");
+
+                Mat content = getTensorContent(*tensor);
+
+                float minVal = qMin.at<float>(0);
+                float rangeScale = (qMax.at<float>(0) - minVal) / 255;
+                CV_Assert(rangeScale >= 0);
+                content.convertTo(content, CV_32FC1, rangeScale,
+                                  rangeScale * cvRound(minVal / rangeScale));
+
+                tensor->set_dtype(tensorflow::DT_FLOAT);
+                tensor->set_tensor_content(content.data, content.total() * content.elemSize1());
+
+                net.mutable_node(tensorId)->set_name(name);
+                CV_Assert(const_layers.insert(std::make_pair(name, tensorId)).second);
+                layers_to_ignore.insert(name);
+                continue;
+            }
+            else if (type != "Const")
+                continue;  // only Const parameters are supported
+
+            if (layer.attr().find("value") != layer.attr().end())
+            {
+                CV_Assert(const_layers.insert(std::make_pair(name, li)).second);
+            }
+            layers_to_ignore.insert(name);
+        }
+        catch (const std::exception& e)
+        {
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't handle node='" << name << "'. Exception: " << e.what());
+            throw;
+        }
+    }
+    CV_LOG_DEBUG(NULL, "DNN/TF: layers_to_ignore.size() = " << layers_to_ignore.size());
+}
+
+// If all inputs of specific layer have the same data layout we can say that
+// this layer's output has this data layout too. Returns DATA_LAYOUT_UNKNOWN otherwise.
+DataLayout TFImporter::predictOutputDataLayout(const tensorflow::NodeDef& layer)
+{
+    DataLayout layout = getDataLayout(layer);
+    if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from attrs)");
+        return layout;
+    }
+
+    // Determine layout by layer's inputs
+    for (int i = 0, n = layer.input_size(); i < n; ++i)
+    {
+        std::map<String, DataLayout>::const_iterator it = data_layouts.find(getNodeName(layer.input(i)));
+        if (it != data_layouts.end())
+        {
+            if (layout != DATA_LAYOUT_UNKNOWN)
+            {
+                if (it->second != layout && it->second != DATA_LAYOUT_UNKNOWN)
+                    return DATA_LAYOUT_UNKNOWN;
+            }
+            else
+                layout = it->second;
+        }
+    }
+
+    if (layout != DATA_LAYOUT_UNKNOWN)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: predictOutputDataLayout(" << layer.name() << " @ " << layer.op() << ") => " << (int)layout << " (from inputs)");
+        return layout;
+    }
+
+    // Determine layout by layer's consumers recursively.
+    std::map<String, DataLayout>::const_iterator it = data_layouts.find(layer.name());
+    CV_Assert(it != data_layouts.end());
+    return it->second;
+}
+
+void TFImporter::populateNet()
+{
+    CV_Assert(netBin.ByteSize() || netTxt.ByteSize());
+
+    CV_LOG_INFO(NULL, "DNN/TF: parsing model"
+        << (netBin.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netBin.versions().producer(), (int)netBin.versions().min_consumer()) : cv::String(" (N/A version info)"))
+        << ". Number of nodes = " << netBin.node_size()
+    );
+
+    if (netTxt.ByteSize())
+    {
+        CV_LOG_INFO(NULL, "DNN/TF: parsing config"
+            << (netTxt.has_versions() ? cv::format(" produced by TF v%d (min_consumer=%d)", (int)netTxt.versions().producer(), (int)netTxt.versions().min_consumer()) : cv::String(" (N/A version info)"))
+            << ". Number of nodes = " << netTxt.node_size()
+        );
+
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+        RemoveIdentityOps(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(config) => " << netTxt.node_size() << " nodes");
+
+        sortByExecutionOrder(netTxt);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(config) => " << netTxt.node_size() << " nodes");
+    }
+    else
+    {
+        removePhaseSwitches(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: removePhaseSwitches(model) => " << netBin.node_size() << " nodes");
+
+        RemoveIdentityOps(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: RemoveIdentityOps(model) => " << netBin.node_size() << " nodes");
+
+        simplifySubgraphs(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: simplifySubgraphs(model) => " << netBin.node_size() << " nodes");
+        sortByExecutionOrder(netBin);
+        CV_LOG_DEBUG(NULL, "DNN/TF: sortByExecutionOrder(model) => " << netBin.node_size() << " nodes");
+    }
+
+    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
+
+    int layersSize = net.node_size();
+
+    // Pre-fill data layouts where they are set explicitly.
+    // Assuming that nodes are in topological order
+    for (int i = layersSize - 1; i >= 0; --i)
+    {
+        const tensorflow::NodeDef& layer = net.node(i);
+        std::string name = layer.name();
+
+        CV_LOG_DEBUG(NULL, "DNN/TF: node(" << i << " - '" << name << "') propagating layout...");
+
+        try
+        {
+            DataLayout layout = getDataLayout(layer);
+            std::map<String, DataLayout>::iterator it = data_layouts.find(name);
+            if (it != data_layouts.end())
+            {
+                if (layout != DATA_LAYOUT_UNKNOWN)
+                {
+                    if (it->second == DATA_LAYOUT_UNKNOWN)
+                        it->second = layout;
+                    else if (it->second != layout)
+                    {
+                        it->second = DATA_LAYOUT_UNKNOWN;
+                        layout = DATA_LAYOUT_UNKNOWN;
+                    }
+                }
+                else
+                    layout = it->second;
+            }
+            else
+                data_layouts[name] = layout;
+
+            // Specify input layers to have the same data layout.
+            for (int j = 0; j < layer.input_size(); ++j)
+            {
+                name = getNodeName(layer.input(j));
+                it = data_layouts.find(name);
+                if (it != data_layouts.end())
+                {
+                    if (layout != DATA_LAYOUT_UNKNOWN)
+                    {
+                        if (it->second == DATA_LAYOUT_UNKNOWN)
+                            it->second = layout;
+                        else if (it->second != layout)
+                            it->second = DATA_LAYOUT_UNKNOWN;
+                    }
+                }
+                else
+                    data_layouts[name] = layout;
+            }
+        }
+        catch (const std::exception& e)
+        {
+            CV_LOG_ERROR(NULL, "DNN/TF: Can't propagate layout for node='" << name << "'. Exception: " << e.what());
+            throw;
+        }
+    }
+
+    addConstNodes(netBin, value_id, layers_to_ignore);
+    addConstNodes(netTxt, value_id, layers_to_ignore);
+
+    if (DNN_DIAGNOSTICS_RUN) {
+        CV_LOG_INFO(NULL, "DNN/TF: start diagnostic run!");
+        layerHandler->fillRegistry(net);
+    }
+
+    for (int li = 0; li < layersSize; li++)
+    {
+        const tensorflow::NodeDef& layer = net.node(li);
+
+        const std::string name = layer.name();
+        const std::string type = layer.op();
+        const int ninputs = layer.input_size();
+        CV_LOG_DEBUG(NULL, "DNN/TF: (" << li << "/" << layersSize << ") Parse layer " << name << " @ " << type << " with " << ninputs << " inputs");
+
+        parseNode(layer);
+    }
+
+    for (size_t i = 0; i < netInputsNames.size(); i++)
+    {
+        CV_LOG_DEBUG(NULL, "DNN/TF: Model input: " << i << " - '" << netInputsNames[i] << "'");
+        CV_Assert(!netInputsNames[i].empty());
+    }
+    dstNet.setInputsNames(netInputsNames);
+    CV_LOG_DEBUG(NULL, (DNN_DIAGNOSTICS_RUN? "DNN/TF: diagnostic run completed!" : "DNN/TF: import completed!"));
+}
+
+void TFImporter::addPermuteLayer(const int* order, const std::string& permName, Pin& inpId)
+{
+    LayerParams permLP;
+    permLP.set("order", DictValue::arrayInt<const int*>(order, 4));
+    CV_Assert(layer_id.find(permName) == layer_id.end());
+    int permId = dstNet.addLayer(permName, "Permute", permLP);
+    layer_id[permName] = permId;
+    connect(layer_id, dstNet, inpId, permId, 0);
+    inpId = Pin(permName);
+}
+
+void TFImporter::parseNode(const tensorflow::NodeDef& layer)
+{
+    tensorflow::GraphDef& net = netTxt.ByteSize() != 0 ? netTxt : netBin;
+
+    const std::string& name = layer.name();
+    const std::string& type = layer.op();
+
+    LayerParams layerParams;
+    try
+    {
+
+        if (layers_to_ignore.find(name) != layers_to_ignore.end())
+        {
+            CV_LOG_DEBUG(NULL, "DNN/TF:     ignored");
+            return;
+        }
+
+        DataLayout predictedLayout = predictOutputDataLayout(layer);
+        data_layouts[name] = predictedLayout;
+
+        DispatchMap::const_iterator iter = dispatch.find(type);
+        if (iter != dispatch.end())
+        {
+            CALL_MEMBER_FN(*this, iter->second)(net, layer, layerParams);
+        }
+        else if (!DNN_DIAGNOSTICS_RUN || !layerHandler->handleMissing(layer))
+        {
+            parseCustomLayer(net, layer, layerParams);
+        }
+    }
+    catch (const std::exception& e)
+    {
+        CV_LOG_ERROR(NULL, "DNN/TF: Can't parse layer for node='" << name << "' of type='" << type
+                                                                  << "'. Exception: " << e.what());
+
+        if (DNN_DIAGNOSTICS_RUN)
+        {
+            layerHandler->handleFailed(layer);
+        }
+        else
+        {
+            throw;
+        }
+    }
+}
+
+TFLayerHandler::TFLayerHandler(TFImporter* importer_) : importer(importer_) {}
+
+void TFLayerHandler::fillRegistry(const tensorflow::GraphDef& net)
+{
+    for (int li = 0; li < net.node_size(); li++) {
+        const tensorflow::NodeDef& layer = net.node(li);
+
+        const std::string& name = layer.name();
+        const std::string& type = layer.op();
+        if (importer->dispatch.find(type) == importer->dispatch.end())
+        {
+            addMissing(name, type);
+        }
+    }
+    printMissing();
+};
+
+bool TFLayerHandler::handleMissing(const tensorflow::NodeDef& layer)
+{
+    bool unsupported = contains(layer.op());
+
+    if (unsupported)
+    {
+        handleFailed(layer);
+    }
+
+    return unsupported;
+}
+
+void TFLayerHandler::handleFailed(const tensorflow::NodeDef& layer)
+{
+    LayerParams lp = getNotImplementedParams(layer.name(), layer.op());
+
+    // the layer will be created or its params and type will be replaced
+    int id = importer->dstNet.addLayer(lp.name, lp.type, lp);
+    if (id != -1) // internal layer failure before the call to addLayer()
     {
-        CV_LOG_ERROR(NULL, "DNN/TF: Can't parse layer for node='" << name << "'. Exception: " << e.what());
-        throw;
+        importer->layer_id[lp.name] = id;
     }
 }
 
@@ -2570,17 +2981,13 @@ void TFImporter::parseNode(const tensorflow::NodeDef& layer_)
 
 Net readNetFromTensorflow(const String &model, const String &config)
 {
-    Net net;
-    TFImporter importer(net, model.c_str(), config.c_str());
-    return net;
+    return detail::readNetDiagnostic<TFImporter>(model.c_str(), config.c_str());
 }
 
 Net readNetFromTensorflow(const char* bufferModel, size_t lenModel,
                           const char* bufferConfig, size_t lenConfig)
 {
-    Net net;
-    TFImporter importer(net, bufferModel, lenModel, bufferConfig, lenConfig);
-    return net;
+    return detail::readNetDiagnostic<TFImporter>(bufferModel, lenModel, bufferConfig, lenConfig);
 }
 
 Net readNetFromTensorflow(const std::vector<uchar>& bufferModel, const std::vector<uchar>& bufferConfig)
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index aab4c6f50774..e8c7e700f651 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -204,7 +204,7 @@ TEST_P(DNNTestNetwork, MobileNet_SSD_Caffe)
     Mat inp = blobFromImage(sample, 1.0f / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
     float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 1.5e-2 : 0.0;
     float iouDiff = (target == DNN_TARGET_MYRIAD) ? 0.063  : 0.0;
-    float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.252  : FLT_MIN;
+    float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.262  : FLT_MIN;
          processNet("dnn/MobileNetSSD_deploy.caffemodel", "dnn/MobileNetSSD_deploy.prototxt",
                     inp, "detection_out", "", scoreDiff, iouDiff, detectionConfThresh);
     expectNoFallbacksFromIE(net);
@@ -359,8 +359,8 @@ TEST_P(DNNTestNetwork, OpenPose_pose_coco)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
 
-    const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.0056 : 0.0;
-    const float lInf = (target == DNN_TARGET_MYRIAD) ? 0.072 : 0.0;
+    const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.009 : 0.0;
+    const float lInf = (target == DNN_TARGET_MYRIAD) ? 0.09 : 0.0;
     processNet("dnn/openpose_pose_coco.caffemodel", "dnn/openpose_pose_coco.prototxt",
                Size(46, 46), "", "", l1, lInf);
     expectNoFallbacksFromIE(net);
@@ -380,8 +380,8 @@ TEST_P(DNNTestNetwork, OpenPose_pose_mpi)
 #endif
 
     // output range: [-0.001, 0.97]
-    const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.012 : 0.0;
-    const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.16 : 0.0;
+    const float l1 = (target == DNN_TARGET_MYRIAD) ? 0.02 : 0.0;
+    const float lInf = (target == DNN_TARGET_MYRIAD || target == DNN_TARGET_OPENCL_FP16) ? 0.2 : 0.0;
     processNet("dnn/openpose_pose_mpi.caffemodel", "dnn/openpose_pose_mpi.prototxt",
                Size(46, 46), "", "", l1, lInf);
     expectNoFallbacksFromIE(net);
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index b285e91d9654..2846f9ae7695 100644
--- a/modules/dnn/test/test_ie_models.cpp
+++ b/modules/dnn/test/test_ie_models.cpp
@@ -112,6 +112,25 @@ static const std::map<std::string, OpenVINOModelTestCaseInfo>& getOpenVINOTestMo
             "intel/age-gender-recognition-retail-0013/FP16/age-gender-recognition-retail-0013",
             "intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013"
         }},
+#endif
+#if INF_ENGINE_RELEASE >= 2021020000
+        // OMZ: 2020.2
+        { "face-detection-0105", {
+            "intel/face-detection-0105/FP32/face-detection-0105",
+            "intel/face-detection-0105/FP16/face-detection-0105"
+        }},
+        { "face-detection-0106", {
+            "intel/face-detection-0106/FP32/face-detection-0106",
+            "intel/face-detection-0106/FP16/face-detection-0106"
+        }},
+#endif
+#if INF_ENGINE_RELEASE >= 2021040000
+        // OMZ: 2021.4
+        { "person-vehicle-bike-detection-2004", {
+            "intel/person-vehicle-bike-detection-2004/FP32/person-vehicle-bike-detection-2004",
+            "intel/person-vehicle-bike-detection-2004/FP16/person-vehicle-bike-detection-2004"
+            //"intel/person-vehicle-bike-detection-2004/FP16-INT8/person-vehicle-bike-detection-2004"
+        }},
 #endif
     };
 
@@ -145,10 +164,22 @@ inline static std::string getOpenVINOModel(const std::string &modelName, bool is
 static inline void genData(const InferenceEngine::TensorDesc& desc, Mat& m, Blob::Ptr& dataPtr)
 {
     const std::vector<size_t>& dims = desc.getDims();
-    m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
-    randu(m, -1, 1);
-
-    dataPtr = make_shared_blob<float>(desc, (float*)m.data);
+    if (desc.getPrecision() == InferenceEngine::Precision::FP32)
+    {
+        m.create(std::vector<int>(dims.begin(), dims.end()), CV_32F);
+        randu(m, -1, 1);
+        dataPtr = make_shared_blob<float>(desc, (float*)m.data);
+    }
+    else if (desc.getPrecision() == InferenceEngine::Precision::I32)
+    {
+        m.create(std::vector<int>(dims.begin(), dims.end()), CV_32S);
+        randu(m, -100, 100);
+        dataPtr = make_shared_blob<int>(desc, (int*)m.data);
+    }
+    else
+    {
+        FAIL() << "Unsupported precision: " << desc.getPrecision();
+    }
 }
 
 void runIE(Target target, const std::string& xmlPath, const std::string& binPath,
@@ -254,7 +285,16 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     BlobMap inputBlobs;
     for (auto& it : net.getInputsInfo())
     {
-        genData(it.second->getTensorDesc(), inputsMap[it.first], inputBlobs[it.first]);
+        const InferenceEngine::TensorDesc& desc = it.second->getTensorDesc();
+        genData(desc, inputsMap[it.first], inputBlobs[it.first]);
+        if (cvtest::debugLevel > 0)
+        {
+            const std::vector<size_t>& dims = desc.getDims();
+            std::cout << "Input: '" << it.first << "' precison=" << desc.getPrecision() << " dims=" << dims.size() << " [";
+            for (auto d : dims)
+                std::cout << " " << d;
+            std::cout << "]  ocv_mat=" << inputsMap[it.first].size << " of " << typeToString(inputsMap[it.first].type()) << std::endl;
+        }
     }
     infRequest.SetInput(inputBlobs);
 
@@ -263,7 +303,16 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     BlobMap outputBlobs;
     for (auto& it : net.getOutputsInfo())
     {
-        genData(it.second->getTensorDesc(), outputsMap[it.first], outputBlobs[it.first]);
+        const InferenceEngine::TensorDesc& desc = it.second->getTensorDesc();
+        genData(desc, outputsMap[it.first], outputBlobs[it.first]);
+        if (cvtest::debugLevel > 0)
+        {
+            const std::vector<size_t>& dims = desc.getDims();
+            std::cout << "Output: '" << it.first << "' precison=" << desc.getPrecision() << " dims=" << dims.size() << " [";
+            for (auto d : dims)
+                std::cout << " " << d;
+            std::cout << "]  ocv_mat=" << outputsMap[it.first].size << " of " << typeToString(outputsMap[it.first].type()) << std::endl;
+        }
     }
     infRequest.SetOutput(outputBlobs);
 
@@ -284,6 +333,12 @@ void runCV(Backend backendId, Target targetId, const std::string& xmlPath, const
     net.setPreferableTarget(targetId);
 
     std::vector<String> outNames = net.getUnconnectedOutLayersNames();
+    if (cvtest::debugLevel > 0)
+    {
+        std::cout << "OpenCV output names: " << outNames.size() << std::endl;
+        for (auto name : outNames)
+            std::cout << "- " << name << std::endl;
+    }
     std::vector<Mat> outs;
     net.forward(outs, outNames);
 
@@ -307,6 +362,28 @@ TEST_P(DNNTestOpenVINO, models)
     ASSERT_FALSE(backendId != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && backendId != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) <<
         "Inference Engine backend is required";
 
+#if INF_ENGINE_VER_MAJOR_GE(2021030000)
+    if (targetId == DNN_TARGET_MYRIAD && (false
+            || modelName == "person-detection-retail-0013"  // ncDeviceOpen:1013 Failed to find booted device after boot
+            || modelName == "age-gender-recognition-retail-0013"  // ncDeviceOpen:1013 Failed to find booted device after boot
+            || modelName == "face-detection-0105"  // get_element_type() must be called on a node with exactly one output
+            || modelName == "face-detection-0106"  // get_element_type() must be called on a node with exactly one output
+            || modelName == "person-vehicle-bike-detection-2004"  // 2021.4+: ncDeviceOpen:1013 Failed to find booted device after boot
+        )
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (targetId == DNN_TARGET_OPENCL && (false
+            || modelName == "face-detection-0106"  // Operation: 2278 of type ExperimentalDetectronPriorGridGenerator(op::v6) is not supported
+        )
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (targetId == DNN_TARGET_OPENCL_FP16 && (false
+            || modelName == "face-detection-0106"  // Operation: 2278 of type ExperimentalDetectronPriorGridGenerator(op::v6) is not supported
+        )
+    )
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
 #if INF_ENGINE_VER_MAJOR_GE(2020020000)
     if (targetId == DNN_TARGET_MYRIAD && backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
     {
@@ -343,6 +420,8 @@ TEST_P(DNNTestOpenVINO, models)
     if (targetId == DNN_TARGET_HDDL)
         releaseHDDLPlugin();
     EXPECT_NO_THROW(runIE(targetId, xmlPath, binPath, inputsMap, ieOutputsMap)) << "runIE";
+    if (targetId == DNN_TARGET_MYRIAD)
+        resetMyriadDevice();
     EXPECT_NO_THROW(runCV(backendId, targetId, xmlPath, binPath, inputsMap, cvOutputsMap)) << "runCV";
 
     double eps = 0;
@@ -350,6 +429,14 @@ TEST_P(DNNTestOpenVINO, models)
     if (targetId == DNN_TARGET_CPU && checkHardwareSupport(CV_CPU_AVX_512F))
         eps = 1e-5;
 #endif
+#if INF_ENGINE_VER_MAJOR_GE(2021030000)
+    if (targetId == DNN_TARGET_CPU && modelName == "face-detection-0105")
+        eps = 2e-4;
+#endif
+#if INF_ENGINE_VER_MAJOR_GE(2021040000)
+    if (targetId == DNN_TARGET_CPU && modelName == "person-vehicle-bike-detection-2004")
+        eps = 1e-6;
+#endif
 
     EXPECT_EQ(ieOutputsMap.size(), cvOutputsMap.size());
     for (auto& srcIt : ieOutputsMap)
diff --git a/modules/dnn/test/test_int8_layers.cpp b/modules/dnn/test/test_int8_layers.cpp
new file mode 100644
index 000000000000..1fcb1d0dba7c
--- /dev/null
+++ b/modules/dnn/test/test_int8_layers.cpp
@@ -0,0 +1,1220 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include "npy_blob.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/dnn/all_layers.hpp>
+namespace opencv_test { namespace {
+
+template<typename TString>
+static std::string _tf(TString filename)
+{
+    return (getOpenCVExtraDir() + "dnn/") + filename;
+}
+
+class Test_Int8_layers : public DNNTestLayer
+{
+public:
+    void testLayer(const String& basename, const String& importer, double l1, double lInf,
+                   int numInps = 1, int numOuts = 1, bool useCaffeModel = false,
+                   bool useCommonInputBlob = true, bool hasText = false)
+    {
+        CV_Assert_N(numInps >= 1, numInps <= 10, numOuts >= 1, numOuts <= 10);
+        std::vector<Mat> inps(numInps), inps_int8(numInps);
+        std::vector<Mat> refs(numOuts), outs_int8(numOuts), outs_dequantized(numOuts);
+        std::vector<float> inputScale, outputScale;
+        std::vector<int> inputZp, outputZp;
+        String inpPath, outPath;
+        Net net, qnet;
+
+        if (importer == "Caffe")
+        {
+            String prototxt = _tf("layers/" + basename + ".prototxt");
+            String caffemodel = _tf("layers/" + basename + ".caffemodel");
+            net = readNetFromCaffe(prototxt, useCaffeModel ? caffemodel : String());
+
+            inpPath = _tf("layers/" + (useCommonInputBlob ? "blob" : basename + ".input"));
+            outPath =  _tf("layers/" + basename);
+        }
+        else if (importer == "TensorFlow")
+        {
+            String netPath = _tf("tensorflow/" + basename + "_net.pb");
+            String netConfig = hasText ? _tf("tensorflow/" + basename + "_net.pbtxt") : "";
+            net = readNetFromTensorflow(netPath, netConfig);
+
+            inpPath = _tf("tensorflow/" + basename + "_in");
+            outPath = _tf("tensorflow/" + basename + "_out");
+        }
+        else if (importer == "ONNX")
+        {
+            String onnxmodel = _tf("onnx/models/" + basename + ".onnx");
+            net = readNetFromONNX(onnxmodel);
+
+            inpPath = _tf("onnx/data/input_" + basename);
+            outPath = _tf("onnx/data/output_" + basename);
+        }
+        ASSERT_FALSE(net.empty());
+        net.setPreferableBackend(backend);
+        net.setPreferableTarget(target);
+
+        for (int i = 0; i < numInps; i++)
+            inps[i] = blobFromNPY(inpPath + ((numInps > 1) ? cv::format("_%d.npy", i) : ".npy"));
+
+        for (int i = 0; i < numOuts; i++)
+            refs[i] = blobFromNPY(outPath + ((numOuts > 1) ? cv::format("_%d.npy", i) : ".npy"));
+
+        qnet = net.quantize(inps, CV_8S, CV_8S);
+        qnet.getInputDetails(inputScale, inputZp);
+        qnet.getOutputDetails(outputScale, outputZp);
+
+        // Quantize inputs to int8
+        // int8_value = float_value/scale + zero-point
+        for (int i = 0; i < numInps; i++)
+        {
+            inps[i].convertTo(inps_int8[i], CV_8S, 1.f/inputScale[i], inputZp[i]);
+            String inp_name = numInps > 1 ? (importer == "Caffe" ? cv::format("input_%d", i) : cv::format("%d", i)) : "";
+            qnet.setInput(inps_int8[i], inp_name);
+        }
+        qnet.forward(outs_int8);
+
+        // Dequantize outputs and compare with reference outputs
+        // float_value = scale*(int8_value - zero-point)
+        for (int i = 0; i < numOuts; i++)
+        {
+            outs_int8[i].convertTo(outs_dequantized[i], CV_32F, outputScale[i], -(outputScale[i] * outputZp[i]));
+            normAssert(refs[i], outs_dequantized[i], "", l1, lInf);
+        }
+    }
+};
+
+TEST_P(Test_Int8_layers, Convolution1D)
+{
+    testLayer("conv1d", "ONNX", 0.00302, 0.00909);
+    testLayer("conv1d_bias", "ONNX", 0.00306, 0.00948);
+}
+
+TEST_P(Test_Int8_layers, Convolution2D)
+{
+    testLayer("layer_convolution", "Caffe", 0.0174, 0.0758, 1, 1, true);
+    testLayer("single_conv", "TensorFlow", 0.00413, 0.02201);
+    testLayer("depthwise_conv2d", "TensorFlow", 0.0388, 0.169);
+    testLayer("atrous_conv2d_valid", "TensorFlow", 0.0193, 0.0633);
+    testLayer("atrous_conv2d_same", "TensorFlow", 0.0185, 0.1322);
+    testLayer("keras_atrous_conv2d_same", "TensorFlow", 0.0056, 0.0244);
+    testLayer("convolution", "ONNX", 0.0052, 0.01516);
+    testLayer("two_convolution", "ONNX", 0.00295, 0.00840);
+}
+
+TEST_P(Test_Int8_layers, Convolution3D)
+{
+    testLayer("conv3d", "TensorFlow", 0.00734, 0.02434);
+    testLayer("conv3d", "ONNX", 0.00353, 0.00941);
+    testLayer("conv3d_bias", "ONNX", 0.00129, 0.00249);
+}
+
+TEST_P(Test_Int8_layers, Flatten)
+{
+    testLayer("flatten", "TensorFlow", 0.0036, 0.0069, 1, 1, false, true, true);
+    testLayer("unfused_flatten", "TensorFlow", 0.0014, 0.0028);
+    testLayer("unfused_flatten_unknown_batch", "TensorFlow", 0.0043, 0.0051);
+}
+
+TEST_P(Test_Int8_layers, Padding)
+{
+    testLayer("padding_valid", "TensorFlow", 0.0026, 0.0064);
+    testLayer("padding_same", "TensorFlow", 0.0081, 0.032);
+    testLayer("spatial_padding", "TensorFlow", 0.0078, 0.028);
+    testLayer("mirror_pad", "TensorFlow", 0.0064, 0.013);
+    testLayer("pad_and_concat", "TensorFlow", 0.0021, 0.0098);
+    testLayer("padding", "ONNX", 0.0005, 0.0069);
+    testLayer("ReflectionPad2d", "ONNX", 0.00062, 0.0018);
+    testLayer("ZeroPad2d", "ONNX", 0.00037, 0.0018);
+}
+
+TEST_P(Test_Int8_layers, AvePooling)
+{
+    testLayer("layer_pooling_ave", "Caffe", 0.0021, 0.0075);
+    testLayer("ave_pool_same", "TensorFlow", 0.00153, 0.0041);
+    testLayer("average_pooling_1d", "ONNX", 0.002, 0.0048);
+    testLayer("average_pooling", "ONNX", 0.0014, 0.0032);
+    testLayer("average_pooling_dynamic_axes", "ONNX", 0.0014, 0.006);
+
+    if (target != DNN_TARGET_CPU)
+        throw SkipTestException("Only CPU is supported");
+    testLayer("ave_pool3d", "TensorFlow", 0.00175, 0.0047);
+    testLayer("ave_pool3d", "ONNX", 0.00063, 0.0016);
+}
+
+TEST_P(Test_Int8_layers, MaxPooling)
+{
+    testLayer("pool_conv_1d", "ONNX", 0.0006, 0.0015);
+    if (target != DNN_TARGET_CPU)
+        throw SkipTestException("Only CPU is supported");
+    testLayer("pool_conv_3d", "ONNX", 0.0033, 0.0124);
+
+    /* All the below tests have MaxPooling as last layer, so computeMaxIdx is set to true
+       which is not supported by int8 maxpooling
+    testLayer("layer_pooling_max", "Caffe", 0.0021, 0.004);
+    testLayer("max_pool_even", "TensorFlow", 0.0048, 0.0139);
+    testLayer("max_pool_odd_valid", "TensorFlow", 0.0043, 0.012);
+    testLayer("conv_pool_nchw", "TensorFlow", 0.007, 0.025);
+    testLayer("max_pool3d", "TensorFlow", 0.0025, 0.0058);
+    testLayer("maxpooling_1d", "ONNX", 0.0018, 0.0037);
+    testLayer("two_maxpooling_1d", "ONNX", 0.0037, 0.0052);
+    testLayer("maxpooling", "ONNX", 0.0034, 0.0065);
+    testLayer("two_maxpooling", "ONNX", 0.0025, 0.0052);
+    testLayer("max_pool3d", "ONNX", 0.0028, 0.0069);*/
+}
+
+TEST_P(Test_Int8_layers, Reduce)
+{
+    testLayer("reduce_mean", "TensorFlow", 0.0005, 0.0014);
+    testLayer("reduce_mean", "ONNX", 0.00062, 0.0014);
+    testLayer("reduce_mean_axis1", "ONNX", 0.00032, 0.0007);
+    testLayer("reduce_mean_axis2", "ONNX", 0.00033, 0.001);
+
+    testLayer("reduce_sum", "TensorFlow", 0.015, 0.031);
+    testLayer("reduce_sum_channel", "TensorFlow", 0.008, 0.019);
+    testLayer("sum_pool_by_axis", "TensorFlow", 0.012, 0.032);
+    testLayer("reduce_sum", "ONNX", 0.0025, 0.0048);
+
+    testLayer("reduce_max", "ONNX", 0, 0);
+    testLayer("reduce_max_axis_0", "ONNX", 0.0042, 0.007);
+    testLayer("reduce_max_axis_1", "ONNX", 0.0018, 0.0036);
+
+    if (target != DNN_TARGET_CPU)
+        throw SkipTestException("Only CPU is supported");
+    testLayer("reduce_mean3d", "ONNX", 0.00048, 0.0016);
+}
+
+TEST_P(Test_Int8_layers, ReLU)
+{
+    testLayer("layer_relu", "Caffe", 0.0005, 0.002);
+    testLayer("ReLU", "ONNX", 0.0012, 0.0047);
+}
+
+TEST_P(Test_Int8_layers, LeakyReLU)
+{
+    testLayer("leaky_relu", "TensorFlow", 0.0002, 0.0004);
+}
+
+TEST_P(Test_Int8_layers, ReLU6)
+{
+    testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062);
+    testLayer("keras_relu6", "TensorFlow", 0.0018, 0.0062, 1, 1, false, true, true);
+    testLayer("clip_by_value", "TensorFlow", 0.0009, 0.002);
+    testLayer("clip", "ONNX", 0.00006, 0.00037);
+}
+
+TEST_P(Test_Int8_layers, Sigmoid)
+{
+    testLayer("maxpooling_sigmoid", "ONNX", 0.0011, 0.0032);
+    testLayer("maxpooling_sigmoid_dynamic_axes", "ONNX", 0.0011, 0.0032);
+    testLayer("maxpooling_sigmoid_1d", "ONNX", 0.0011, 0.0037);
+}
+
+TEST_P(Test_Int8_layers, Mish)
+{
+    testLayer("mish", "ONNX", 0.0015, 0.0025);
+}
+
+TEST_P(Test_Int8_layers, Softmax)
+{
+    testLayer("layer_softmax", "Caffe", 0.0011, 0.0036);
+    testLayer("keras_softmax", "TensorFlow", 0.00093, 0.0027);
+    testLayer("slim_softmax", "TensorFlow", 0.0016, 0.0034);
+    testLayer("slim_softmax_v2", "TensorFlow", 0.0029, 0.017);
+    testLayer("softmax", "ONNX", 0.0016, 0.0028);
+    testLayer("log_softmax", "ONNX", 0.014, 0.025);
+    testLayer("softmax_unfused", "ONNX", 0.0009, 0.0021);
+}
+
+TEST_P(Test_Int8_layers, Concat)
+{
+    testLayer("layer_concat_shared_input", "Caffe", 0.0076, 0.029, 1, 1, true, false);
+    testLayer("concat_axis_1", "TensorFlow", 0.0056, 0.017);
+    testLayer("keras_pad_concat", "TensorFlow", 0.0032, 0.0089);
+    testLayer("concat_3d", "TensorFlow", 0.005, 0.014);
+    testLayer("concatenation", "ONNX", 0.0032, 0.009);
+}
+
+TEST_P(Test_Int8_layers, BatchNorm)
+{
+    testLayer("layer_batch_norm", "Caffe", 0.0061, 0.019, 1, 1, true);
+    testLayer("fused_batch_norm", "TensorFlow", 0.0063, 0.02);
+    testLayer("batch_norm_text", "TensorFlow", 0.0048, 0.013, 1, 1, false, true, true);
+    testLayer("unfused_batch_norm", "TensorFlow", 0.0076, 0.019);
+    testLayer("fused_batch_norm_no_gamma", "TensorFlow", 0.0067, 0.015);
+    testLayer("unfused_batch_norm_no_gamma", "TensorFlow", 0.0123, 0.044);
+    testLayer("switch_identity", "TensorFlow", 0.0035, 0.011);
+    testLayer("batch_norm3d", "TensorFlow", 0.0077, 0.02);
+    testLayer("batch_norm", "ONNX", 0.0012, 0.0049);
+    testLayer("batch_norm_3d", "ONNX", 0.0039, 0.012);
+    testLayer("frozenBatchNorm2d", "ONNX", 0.001, 0.0018);
+    testLayer("batch_norm_subgraph", "ONNX", 0.0049, 0.0098);
+}
+
+TEST_P(Test_Int8_layers, Scale)
+{
+    testLayer("batch_norm", "TensorFlow", 0.0028, 0.0098);
+    testLayer("scale", "ONNX", 0.0025, 0.0071);
+    testLayer("expand_hw", "ONNX", 0.0012, 0.0012);
+    testLayer("flatten_const", "ONNX", 0.0024, 0.0048);
+}
+
+TEST_P(Test_Int8_layers, InnerProduct)
+{
+    testLayer("layer_inner_product", "Caffe", 0.005, 0.02, 1, 1, true);
+    testLayer("matmul", "TensorFlow", 0.0061, 0.019);
+    testLayer("nhwc_transpose_reshape_matmul", "TensorFlow", 0.0009, 0.0091);
+    testLayer("nhwc_reshape_matmul", "TensorFlow", 0.03, 0.071);
+    testLayer("matmul_layout", "TensorFlow", 0.035, 0.06);
+    testLayer("tf2_dense", "TensorFlow", 0, 0);
+    testLayer("matmul_add", "ONNX", 0.041, 0.082);
+    testLayer("linear", "ONNX", 0.0018, 0.0029);
+    testLayer("constant", "ONNX", 0.00021, 0.0006);
+    testLayer("lin_with_constant", "ONNX", 0.0011, 0.0016);
+}
+
+TEST_P(Test_Int8_layers, Reshape)
+{
+    testLayer("reshape_layer", "TensorFlow", 0.0032, 0.0082);
+    testLayer("reshape_nchw", "TensorFlow", 0.0089, 0.029);
+    testLayer("reshape_conv", "TensorFlow", 0.035, 0.054);
+    testLayer("reshape_reduce", "TensorFlow", 0.0042, 0.0078);
+    testLayer("reshape_as_shape", "TensorFlow", 0.0014, 0.0028);
+    testLayer("reshape_no_reorder", "TensorFlow", 0.0014, 0.0028);
+    testLayer("shift_reshape_no_reorder", "TensorFlow", 0.0063, 0.014);
+    testLayer("dynamic_reshape", "ONNX", 0.0047, 0.0079);
+    testLayer("dynamic_reshape_opset_11", "ONNX", 0.0048, 0.0081);
+    testLayer("flatten_by_prod", "ONNX", 0.0048, 0.0081);
+    testLayer("squeeze", "ONNX", 0.0048, 0.0081);
+    testLayer("unsqueeze", "ONNX", 0.0033, 0.0053);
+    testLayer("squeeze_and_conv_dynamic_axes", "ONNX", 0.0054, 0.0154);
+    testLayer("unsqueeze_and_conv_dynamic_axes", "ONNX", 0.0037, 0.0151);
+}
+
+TEST_P(Test_Int8_layers, Permute)
+{
+    testLayer("tf2_permute_nhwc_ncwh", "TensorFlow", 0.0028, 0.006);
+    testLayer("transpose", "ONNX", 0.0015, 0.0046);
+}
+
+TEST_P(Test_Int8_layers, Identity)
+{
+    testLayer("expand_batch", "ONNX", 0.0027, 0.0036);
+    testLayer("expand_channels", "ONNX", 0.0013, 0.0019);
+    testLayer("expand_neg_batch", "ONNX", 0.00071, 0.0019);
+}
+
+TEST_P(Test_Int8_layers, Slice)
+{
+    testLayer("split", "TensorFlow", 0.0033, 0.0056);
+    testLayer("slice_4d", "TensorFlow", 0.003, 0.0073);
+    testLayer("strided_slice", "TensorFlow", 0.008, 0.0142);
+    testLayer("slice", "ONNX", 0.0046, 0.0077);
+    testLayer("slice_dynamic_axes", "ONNX", 0.0039, 0.0084);
+    testLayer("slice_opset_11_steps_2d", "ONNX", 0.0052, 0.0124);
+    testLayer("slice_opset_11_steps_3d", "ONNX", 0.0068, 0.014);
+    testLayer("slice_opset_11_steps_4d", "ONNX", 0.0041, 0.008);
+    testLayer("slice_opset_11_steps_5d", "ONNX", 0.0085, 0.021);
+}
+
+TEST_P(Test_Int8_layers, Dropout)
+{
+    testLayer("layer_dropout", "Caffe", 0.0021, 0.004);
+    testLayer("dropout", "ONNX", 0.0029, 0.004);
+}
+
+TEST_P(Test_Int8_layers, Eltwise)
+{
+    testLayer("layer_eltwise", "Caffe", 0.062, 0.15);
+    testLayer("conv_2_inps", "Caffe", 0.0086, 0.0232, 2, 1, true, false);
+    testLayer("eltwise_sub", "TensorFlow", 0.015, 0.047);
+    testLayer("eltwise_add_vec", "TensorFlow", 0.037, 0.21); // tflite 0.0095, 0.0365
+    testLayer("eltwise_mul_vec", "TensorFlow", 0.173, 1.14); // tflite 0.0028, 0.017
+    testLayer("channel_broadcast", "TensorFlow", 0.0025, 0.0063);
+    testLayer("split_equals", "TensorFlow", 0.02, 0.065);
+    testLayer("mul", "ONNX", 0.0039, 0.014);
+    testLayer("split_max", "ONNX", 0.004, 0.012);
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_layers, dnnBackendsAndTargets());
+
+class Test_Int8_nets : public DNNTestLayer
+{
+public:
+    void testClassificationNet(Net baseNet, const Mat& blob, const Mat& ref, double l1, double lInf)
+    {
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
+
+        qnet.setInput(blob);
+        Mat out = qnet.forward();
+        normAssert(ref, out, "", l1, lInf);
+    }
+
+    void testDetectionNet(Net baseNet, const Mat& blob, const Mat& ref,
+                          double confThreshold, double scoreDiff, double iouDiff)
+    {
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
+
+        qnet.setInput(blob);
+        Mat out = qnet.forward();
+        normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff);
+    }
+
+    void testFaster(Net baseNet, const Mat& ref, double confThreshold, double scoreDiff, double iouDiff)
+    {
+        Mat inp = imread(_tf("dog416.png"));
+        resize(inp, inp, Size(800, 600));
+        Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(102.9801, 115.9465, 122.7717), false, false);
+        Mat imInfo = (Mat_<float>(1, 3) << inp.rows, inp.cols, 1.6f);
+
+        Net qnet = baseNet.quantize(std::vector<Mat>{blob, imInfo}, CV_32F, CV_32F);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
+
+        qnet.setInput(blob, "data");
+        qnet.setInput(imInfo, "im_info");
+        Mat out = qnet.forward();
+        normAssertDetections(ref, out, "", confThreshold, scoreDiff, iouDiff);
+    }
+
+    void testONNXNet(const String& basename, double l1, double lInf, bool useSoftmax = false)
+    {
+        String onnxmodel = findDataFile("dnn/onnx/models/" + basename + ".onnx", false);
+
+        Mat blob = readTensorFromONNX(findDataFile("dnn/onnx/data/input_" + basename + ".pb"));
+        Mat ref = readTensorFromONNX(findDataFile("dnn/onnx/data/output_" + basename + ".pb"));
+        Net baseNet = readNetFromONNX(onnxmodel);
+        baseNet.setPreferableBackend(backend);
+        baseNet.setPreferableTarget(target);
+
+        Net qnet = baseNet.quantize(blob, CV_32F, CV_32F);
+        qnet.setInput(blob);
+        Mat out = qnet.forward();
+
+        if (useSoftmax)
+        {
+            LayerParams lp;
+            Net netSoftmax;
+            netSoftmax.addLayerToPrev("softmaxLayer", "Softmax", lp);
+            netSoftmax.setPreferableBackend(DNN_BACKEND_OPENCV);
+
+            netSoftmax.setInput(out);
+            out = netSoftmax.forward();
+
+            netSoftmax.setInput(ref);
+            ref = netSoftmax.forward();
+        }
+
+        normAssert(ref, out, "", l1, lInf);
+    }
+
+    void testDarknetModel(const std::string& cfg, const std::string& weights,
+                          const cv::Mat& ref, double scoreDiff, double iouDiff,
+                          float confThreshold = 0.24, float nmsThreshold = 0.4)
+    {
+        CV_Assert(ref.cols == 7);
+        std::vector<std::vector<int> > refClassIds;
+        std::vector<std::vector<float> > refScores;
+        std::vector<std::vector<Rect2d> > refBoxes;
+        for (int i = 0; i < ref.rows; ++i)
+        {
+            int batchId = static_cast<int>(ref.at<float>(i, 0));
+            int classId = static_cast<int>(ref.at<float>(i, 1));
+            float score = ref.at<float>(i, 2);
+            float left  = ref.at<float>(i, 3);
+            float top   = ref.at<float>(i, 4);
+            float right  = ref.at<float>(i, 5);
+            float bottom = ref.at<float>(i, 6);
+            Rect2d box(left, top, right - left, bottom - top);
+            if (batchId >= refClassIds.size())
+            {
+                refClassIds.resize(batchId + 1);
+                refScores.resize(batchId + 1);
+                refBoxes.resize(batchId + 1);
+            }
+            refClassIds[batchId].push_back(classId);
+            refScores[batchId].push_back(score);
+            refBoxes[batchId].push_back(box);
+        }
+
+        Mat img1 = imread(_tf("dog416.png"));
+        Mat img2 = imread(_tf("street.png"));
+        std::vector<Mat> samples(2);
+        samples[0] = img1; samples[1] = img2;
+
+        // determine test type, whether batch or single img
+        int batch_size = refClassIds.size();
+        CV_Assert(batch_size == 1 || batch_size == 2);
+        samples.resize(batch_size);
+
+        Mat inp = blobFromImages(samples, 1.0/255, Size(416, 416), Scalar(), true, false);
+
+        Net baseNet = readNetFromDarknet(findDataFile("dnn/" + cfg), findDataFile("dnn/" + weights, false));
+        Net qnet = baseNet.quantize(inp, CV_32F, CV_32F);
+        qnet.setPreferableBackend(backend);
+        qnet.setPreferableTarget(target);
+        qnet.setInput(inp);
+        std::vector<Mat> outs;
+        qnet.forward(outs, qnet.getUnconnectedOutLayersNames());
+
+        for (int b = 0; b < batch_size; ++b)
+        {
+            std::vector<int> classIds;
+            std::vector<float> confidences;
+            std::vector<Rect2d> boxes;
+            for (int i = 0; i < outs.size(); ++i)
+            {
+                Mat out;
+                if (batch_size > 1){
+                    // get the sample slice from 3D matrix (batch, box, classes+5)
+                    Range ranges[3] = {Range(b, b+1), Range::all(), Range::all()};
+                    out = outs[i](ranges).reshape(1, outs[i].size[1]);
+                }else{
+                    out = outs[i];
+                }
+                for (int j = 0; j < out.rows; ++j)
+                {
+                    Mat scores = out.row(j).colRange(5, out.cols);
+                    double confidence;
+                    Point maxLoc;
+                    minMaxLoc(scores, 0, &confidence, 0, &maxLoc);
+
+                    if (confidence > confThreshold) {
+                        float* detection = out.ptr<float>(j);
+                        double centerX = detection[0];
+                        double centerY = detection[1];
+                        double width = detection[2];
+                        double height = detection[3];
+                        boxes.push_back(Rect2d(centerX - 0.5 * width, centerY - 0.5 * height,
+                                            width, height));
+                        confidences.push_back(confidence);
+                        classIds.push_back(maxLoc.x);
+                    }
+                }
+            }
+
+            // here we need NMS of boxes
+            std::vector<int> indices;
+            NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
+
+            std::vector<int> nms_classIds;
+            std::vector<float> nms_confidences;
+            std::vector<Rect2d> nms_boxes;
+
+            for (size_t i = 0; i < indices.size(); ++i)
+            {
+                int idx = indices[i];
+                Rect2d box = boxes[idx];
+                float conf = confidences[idx];
+                int class_id = classIds[idx];
+                nms_boxes.push_back(box);
+                nms_confidences.push_back(conf);
+                nms_classIds.push_back(class_id);
+            }
+
+            if (cvIsNaN(iouDiff))
+            {
+                if (b == 0)
+                    std::cout << "Skip accuracy checks" << std::endl;
+                continue;
+            }
+
+            normAssertDetections(refClassIds[b], refScores[b], refBoxes[b], nms_classIds, nms_confidences, nms_boxes,
+                                 format("batch size %d, sample %d\n", batch_size, b).c_str(), confThreshold, scoreDiff, iouDiff);
+        }
+    }
+};
+
+TEST_P(Test_Int8_nets, AlexNet)
+{
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
+    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+    if (backend != DNN_BACKEND_OPENCV)
+        throw SkipTestException("Only OpenCV backend is supported");
+
+    Net net = readNetFromCaffe(findDataFile("dnn/bvlc_alexnet.prototxt"),
+                               findDataFile("dnn/bvlc_alexnet.caffemodel", false));
+
+    Mat inp = imread(_tf("grace_hopper_227.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false);
+    Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy"));
+
+    float l1 = 1e-4, lInf = 0.003;
+    testClassificationNet(net, blob, ref, l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, GoogLeNet)
+{
+    Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt"),
+                               findDataFile("dnn/bvlc_googlenet.caffemodel", false));
+
+    std::vector<Mat> inpMats;
+    inpMats.push_back( imread(_tf("googlenet_0.png")) );
+    inpMats.push_back( imread(_tf("googlenet_1.png")) );
+    Mat blob = blobFromImages(inpMats, 1.0, Size(224, 224), Scalar(), false);
+    Mat ref = blobFromNPY(_tf("googlenet_prob.npy"));
+
+    float l1 = 2e-4, lInf = 0.06;
+    testClassificationNet(net, blob, ref, l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, ResNet50)
+{
+    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+    if (backend != DNN_BACKEND_OPENCV)
+        throw SkipTestException("Only OpenCV backend is supported");
+
+    Net net = readNetFromCaffe(findDataFile("dnn/ResNet-50-deploy.prototxt"),
+                               findDataFile("dnn/ResNet-50-model.caffemodel", false));
+
+    Mat inp = imread(_tf("googlenet_0.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(224, 224), Scalar(), false);
+    Mat ref = blobFromNPY(_tf("resnet50_prob.npy"));
+
+    float l1 = 3e-4, lInf = 0.035;
+    testClassificationNet(net, blob, ref, l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, DenseNet121)
+{
+    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+
+    Net net = readNetFromCaffe(findDataFile("dnn/DenseNet_121.prototxt", false),
+                               findDataFile("dnn/DenseNet_121.caffemodel", false));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0 / 255.0, Size(224, 224), Scalar(), true, true);
+    Mat ref = blobFromNPY(_tf("densenet_121_output.npy"));
+
+    float l1 = 0.76, lInf = 3.31; // seems wrong
+    testClassificationNet(net, blob, ref, l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, SqueezeNet_v1_1)
+{
+    if(target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
+    Net net = readNetFromCaffe(findDataFile("dnn/squeezenet_v1.1.prototxt"),
+                               findDataFile("dnn/squeezenet_v1.1.caffemodel", false));
+
+    Mat inp = imread(_tf("googlenet_0.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(227, 227), Scalar(), false, true);
+    Mat ref = blobFromNPY(_tf("squeezenet_v1.1_prob.npy"));
+
+    float l1 = 3e-4, lInf = 0.056;
+    testClassificationNet(net, blob, ref, l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, CaffeNet)
+{
+#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32))
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
+    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
+        && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+    float l1 = 4e-5, lInf = 0.0025;
+    testONNXNet("caffenet", l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, RCNN_ILSVRC13)
+{
+#if defined(OPENCV_32BIT_CONFIGURATION) && (defined(HAVE_OPENCL) || defined(_WIN32))
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
+#else
+    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#endif
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2019030000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD
+        && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+    float l1 = 0.02, lInf = 0.042;
+    testONNXNet("rcnn_ilsvrc13", l1, lInf);
+}
+
+TEST_P(Test_Int8_nets, Inception_v2)
+{
+    testONNXNet("inception_v2",  default_l1,  default_lInf, true);
+}
+
+TEST_P(Test_Int8_nets, MobileNet_v2)
+{
+    testONNXNet("mobilenetv2", default_l1, default_lInf, true);
+}
+
+TEST_P(Test_Int8_nets, Shufflenet)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+    {
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    }
+    testONNXNet("shufflenet", default_l1, default_lInf);
+}
+
+TEST_P(Test_Int8_nets, MobileNet_SSD)
+{
+    Net net = readNetFromCaffe(findDataFile("dnn/MobileNetSSD_deploy.prototxt", false),
+                               findDataFile("dnn/MobileNetSSD_deploy.caffemodel", false));
+
+    Mat inp = imread(_tf("street.png"));
+    Mat blob = blobFromImage(inp, 1.0 / 127.5, Size(300, 300), Scalar(127.5, 127.5, 127.5), false);
+    Mat ref = blobFromNPY(_tf("mobilenet_ssd_caffe_out.npy"));
+
+    float confThreshold = FLT_MIN, scoreDiff = 0.059, iouDiff = 0.11;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, MobileNet_v1_SSD)
+{
+    Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false),
+                                    findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt"));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false);
+    Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy"));
+
+    float confThreshold = 0.5, scoreDiff = 0.034, iouDiff = 0.13;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, MobileNet_v1_SSD_PPN)
+{
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+        applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16,
+                     CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    Net net = readNetFromTensorflow(findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pb", false),
+                                    findDataFile("dnn/ssd_mobilenet_v1_ppn_coco.pbtxt"));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false);
+    Mat ref = blobFromNPY(_tf("tensorflow/ssd_mobilenet_v1_ppn_coco.detection_out.npy"));
+
+    float confThreshold = 0.51, scoreDiff = 0.04, iouDiff = 0.06;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, Inception_v2_SSD)
+{
+    applyTestTag(target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB);
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD &&
+        getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    Net net = readNetFromTensorflow(findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pb", false),
+                                    findDataFile("dnn/ssd_inception_v2_coco_2017_11_17.pbtxt"));
+
+    Mat inp = imread(_tf("street.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(300, 300), Scalar(), true, false);
+    Mat ref = (Mat_<float>(5, 7) << 0, 1, 0.90176028, 0.19872092, 0.36311883, 0.26461923, 0.63498729,
+                                    0, 3, 0.93569964, 0.64865261, 0.45906419, 0.80675775, 0.65708131,
+                                    0, 3, 0.75838411, 0.44668293, 0.45907149, 0.49459291, 0.52197015,
+                                    0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527,
+                                    0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384);
+
+    float confThreshold = 0.5, scoreDiff = 0.0114, iouDiff = 0.22;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, opencv_face_detector)
+{
+    Net net = readNetFromCaffe(findDataFile("dnn/opencv_face_detector.prototxt"),
+                               findDataFile("dnn/opencv_face_detector.caffemodel", false));
+
+    Mat inp = imread(findDataFile("gpu/lbpcascade/er.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(104.0, 177.0, 123.0), false, false);
+    Mat ref = (Mat_<float>(6, 7) << 0, 1, 0.99520785, 0.80997437, 0.16379407, 0.87996572, 0.26685631,
+                                    0, 1, 0.9934696, 0.2831718, 0.50738752, 0.345781, 0.5985168,
+                                    0, 1, 0.99096733, 0.13629119, 0.24892329, 0.19756334, 0.3310290,
+                                    0, 1, 0.98977017, 0.23901358, 0.09084064, 0.29902688, 0.1769477,
+                                    0, 1, 0.97203469, 0.67965847, 0.06876482, 0.73999709, 0.1513494,
+                                    0, 1, 0.95097077, 0.51901293, 0.45863652, 0.5777427, 0.5347801);
+
+    float confThreshold = 0.5, scoreDiff = 0.002, iouDiff = 0.21;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, EfficientDet)
+{
+    if (target != DNN_TARGET_CPU)
+    {
+        if (target == DNN_TARGET_OPENCL_FP16) applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+        if (target == DNN_TARGET_OPENCL)      applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL);
+        if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
+    }
+    Net net = readNetFromTensorflow(findDataFile("dnn/efficientdet-d0.pb", false),
+                                    findDataFile("dnn/efficientdet-d0.pbtxt"));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0/255, Size(512, 512), Scalar(123.675, 116.28, 103.53));
+    Mat ref = (Mat_<float>(3, 7) << 0, 1, 0.8437444, 0.153996080160141, 0.20534580945968628, 0.7463544607162476, 0.7414066195487976,
+                                    0, 17, 0.8245924, 0.16657517850399017, 0.3996818959712982, 0.4111558794975281, 0.9306337833404541,
+                                    0, 7, 0.8039304, 0.6118435263633728, 0.13175517320632935, 0.9065558314323425, 0.2943994700908661);
+
+    float confThreshold = 0.65, scoreDiff = 0.17, iouDiff = 0.18;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, FasterRCNN_resnet50)
+{
+    applyTestTag(
+        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
+#ifdef INF_ENGINE_RELEASE
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 &&
+        (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+
+    if (INF_ENGINE_VER_MAJOR_GT(2019030000) &&
+        backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
+    if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+
+    Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pb", false),
+                                    findDataFile("dnn/faster_rcnn_resnet50_coco_2018_01_28.pbtxt"));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false);
+    Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_resnet50_coco_2018_01_28.detection_out.npy"));
+
+    float confThreshold = 0.5, scoreDiff = 0.025, iouDiff = 0.15;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, FasterRCNN_inceptionv2)
+{
+    applyTestTag(
+        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
+#ifdef INF_ENGINE_RELEASE
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 &&
+        (INF_ENGINE_VER_MAJOR_LT(2019020000) || target != DNN_TARGET_CPU))
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+
+    if (INF_ENGINE_VER_MAJOR_GT(2019030000) &&
+        backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+#endif
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+
+    if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
+    if (backend == DNN_BACKEND_CUDA && target == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+
+    Net net = readNetFromTensorflow(findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pb", false),
+                                    findDataFile("dnn/faster_rcnn_inception_v2_coco_2018_01_28.pbtxt"));
+
+    Mat inp = imread(_tf("dog416.png"));
+    Mat blob = blobFromImage(inp, 1.0, Size(800, 600), Scalar(), true, false);
+    Mat ref = blobFromNPY(_tf("tensorflow/faster_rcnn_inception_v2_coco_2018_01_28.detection_out.npy"));
+
+    float confThreshold = 0.5, scoreDiff = 0.21, iouDiff = 0.1;
+    testDetectionNet(net, blob, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, FasterRCNN_vgg16)
+{
+    applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
+        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+#endif
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
+#if defined(INF_ENGINE_RELEASE)
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
+        applyTestTag(target == DNN_TARGET_OPENCL ? CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
+#endif
+
+    Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_vgg16.prototxt"),
+                               findDataFile("dnn/VGG16_faster_rcnn_final.caffemodel", false));
+
+    Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.949398, 99.2454, 210.141, 601.205, 462.849,
+                                    0, 7, 0.997022, 481.841, 92.3218, 722.685, 175.953,
+                                    0, 12, 0.993028, 133.221, 189.377, 350.994, 563.166);
+
+    float confThreshold = 0.8, scoreDiff = 0.024, iouDiff = 0.35;
+    testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, FasterRCNN_zf)
+{
+    applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
+        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB),
+#endif
+        CV_TEST_TAG_DEBUG_LONG
+    );
+
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
+
+    if (target == DNN_TARGET_CUDA_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_CUDA_FP16);
+
+    Net net = readNetFromCaffe(findDataFile("dnn/faster_rcnn_zf.prototxt"),
+                               findDataFile("dnn/ZF_faster_rcnn_final.caffemodel", false));
+
+    Mat ref = (Mat_<float>(3, 7) << 0, 2, 0.90121, 120.407, 115.83, 570.586, 528.395,
+                                    0, 7, 0.988779, 469.849, 75.1756, 718.64, 186.762,
+                                    0, 12, 0.967198, 138.588, 206.843, 329.766, 553.176);
+
+    float confThreshold = 0.8, scoreDiff = 0.021, iouDiff = 0.1;
+    testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, RFCN)
+{
+    applyTestTag(
+        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_LONG,
+        CV_TEST_TAG_DEBUG_VERYLONG
+    );
+
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD);
+
+    Net net = readNetFromCaffe(findDataFile("dnn/rfcn_pascal_voc_resnet50.prototxt"),
+                               findDataFile("dnn/resnet50_rfcn_final.caffemodel", false));
+
+    Mat ref = (Mat_<float>(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234,
+                                    0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
+
+    float confThreshold = 0.8, scoreDiff = 0.017, iouDiff = 0.11;
+    testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
+}
+
+TEST_P(Test_Int8_nets, YoloVoc)
+{
+    applyTestTag(
+#if defined(OPENCV_32BIT_CONFIGURATION) && defined(HAVE_OPENCL)
+        CV_TEST_TAG_MEMORY_2GB,
+#else
+        CV_TEST_TAG_MEMORY_1GB,
+#endif
+        CV_TEST_TAG_LONG
+    );
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16);
+#endif
+#if defined(INF_ENGINE_RELEASE)
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) &&
+        target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+#endif
+
+    Mat ref = (Mat_<float>(6, 7) << 0, 6,  0.750469f, 0.577374f, 0.127391f, 0.902949f, 0.300809f,
+                                    0, 1,  0.780879f, 0.270762f, 0.264102f, 0.732475f, 0.745412f,
+                                    0, 11, 0.901615f, 0.1386f,   0.338509f, 0.421337f, 0.938789f,
+                                    1, 14, 0.623813f, 0.183179f, 0.381921f, 0.247726f, 0.625847f,
+                                    1, 6,  0.667770f, 0.446555f, 0.453578f, 0.499986f, 0.519167f,
+                                    1, 6,  0.844947f, 0.637058f, 0.460398f, 0.828508f, 0.66427f);
+
+    std::string config_file = "yolo-voc.cfg";
+    std::string weights_file = "yolo-voc.weights";
+
+    double scoreDiff = 0.1, iouDiff = 0.3;
+    {
+    SCOPED_TRACE("batch size 1");
+    testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
+    }
+
+    {
+    SCOPED_TRACE("batch size 2");
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    }
+}
+
+TEST_P(Test_Int8_nets, TinyYoloVoc)
+{
+    applyTestTag(CV_TEST_TAG_MEMORY_512MB);
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+#if defined(INF_ENGINE_RELEASE)
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) &&
+        target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+#endif
+
+    Mat ref = (Mat_<float>(4, 7) << 0, 6,  0.761967f, 0.579042f, 0.159161f, 0.894482f, 0.31994f,
+                                    0, 11, 0.780595f, 0.129696f, 0.386467f, 0.445275f, 0.920994f,
+                                    1, 6,  0.651450f, 0.460526f, 0.458019f, 0.522527f, 0.5341f,
+                                    1, 6,  0.928758f, 0.651024f, 0.463539f, 0.823784f, 0.654998f);
+
+    std::string config_file = "tiny-yolo-voc.cfg";
+    std::string weights_file = "tiny-yolo-voc.weights";
+
+    double scoreDiff = 0.043, iouDiff = 0.12;
+    {
+    SCOPED_TRACE("batch size 1");
+    testDarknetModel(config_file, weights_file, ref.rowRange(0, 2), scoreDiff, iouDiff);
+    }
+
+    {
+    SCOPED_TRACE("batch size 2");
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    }
+}
+
+TEST_P(Test_Int8_nets, YOLOv3)
+{
+    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
+
+    const int N0 = 3;
+    const int N1 = 6;
+    static const float ref_[/* (N0 + N1) * 7 */] = {
+0, 16, 0.998836f, 0.160024f, 0.389964f, 0.417885f, 0.943716f,
+0, 1, 0.987908f, 0.150913f, 0.221933f, 0.742255f, 0.746261f,
+0, 7, 0.952983f, 0.614621f, 0.150257f, 0.901368f, 0.289251f,
+
+1, 2, 0.997412f, 0.647584f, 0.459939f, 0.821037f, 0.663947f,
+1, 2, 0.989633f, 0.450719f, 0.463353f, 0.496306f, 0.522258f,
+1, 0, 0.980053f, 0.195856f, 0.378454f, 0.258626f, 0.629257f,
+1, 9, 0.785341f, 0.665503f, 0.373543f, 0.688893f, 0.439244f,
+1, 9, 0.733275f, 0.376029f, 0.315694f, 0.401776f, 0.395165f,
+1, 9, 0.384815f, 0.659824f, 0.372389f, 0.673927f, 0.429412f,
+    };
+    Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
+
+    std::string config_file = "yolov3.cfg";
+    std::string weights_file = "yolov3.weights";
+
+    double scoreDiff = 0.08, iouDiff = 0.21, confThreshold = 0.25;
+    {
+        SCOPED_TRACE("batch size 1");
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold);
+    }
+
+#if defined(INF_ENGINE_RELEASE)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+    {
+        if (target == DNN_TARGET_OPENCL)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+        else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000))
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+        else if (target == DNN_TARGET_MYRIAD &&
+                 getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+            applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+    }
+#endif
+
+    {
+        SCOPED_TRACE("batch size 2");
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold);
+    }
+}
+
+TEST_P(Test_Int8_nets, YOLOv4)
+{
+    applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+#if defined(INF_ENGINE_RELEASE)
+    if (target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    const int N0 = 3;
+    const int N1 = 7;
+    static const float ref_[/* (N0 + N1) * 7 */] = {
+0, 16, 0.992194f, 0.172375f, 0.402458f, 0.403918f, 0.932801f,
+0, 1, 0.988326f, 0.166708f, 0.228236f, 0.737208f, 0.735803f,
+0, 7, 0.94639f, 0.602523f, 0.130399f, 0.901623f, 0.298452f,
+
+1, 2, 0.99761f, 0.646556f, 0.45985f, 0.816041f, 0.659067f,
+1, 0, 0.988913f, 0.201726f, 0.360282f, 0.266181f, 0.631728f,
+1, 2, 0.98233f, 0.452007f, 0.462217f, 0.495612f, 0.521687f,
+1, 9, 0.919195f, 0.374642f, 0.316524f, 0.398126f, 0.393714f,
+1, 9, 0.856303f, 0.666842f, 0.372215f, 0.685539f, 0.44141f,
+1, 9, 0.313516f, 0.656791f, 0.374734f, 0.671959f, 0.438371f,
+1, 9, 0.256625f, 0.940232f, 0.326931f, 0.967586f, 0.374002f,
+    };
+    Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
+
+    std::string config_file = "yolov4.cfg";
+    std::string weights_file = "yolov4.weights";
+    double scoreDiff = 0.1, iouDiff = 0.17;
+    {
+        SCOPED_TRACE("batch size 1");
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+    }
+
+    {
+        SCOPED_TRACE("batch size 2");
+
+#if defined(INF_ENGINE_RELEASE)
+        if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        {
+            if (target == DNN_TARGET_OPENCL)
+                applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+            else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000))
+                applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+            else if (target == DNN_TARGET_MYRIAD &&
+                     getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
+                applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
+        }
+#endif
+
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    }
+}
+
+TEST_P(Test_Int8_nets, YOLOv4_tiny)
+{
+    applyTestTag(
+        target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_512MB : CV_TEST_TAG_MEMORY_1GB
+    );
+
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2021010000)
+    if (target == DNN_TARGET_MYRIAD)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+
+    const float confThreshold = 0.6;
+
+    const int N0 = 2;
+    const int N1 = 3;
+    static const float ref_[/* (N0 + N1) * 7 */] = {
+0, 7, 0.85935f, 0.593484f, 0.141211f, 0.920356f, 0.291593f,
+0, 16, 0.795188f, 0.169207f, 0.386886f, 0.423753f, 0.933004f,
+
+1, 2, 0.996832f, 0.653802f, 0.464573f, 0.815193f, 0.653292f,
+1, 2, 0.963325f, 0.451151f, 0.458915f, 0.496255f, 0.52241f,
+1, 0, 0.926244f, 0.194851f, 0.361743f, 0.260277f, 0.632364f,
+    };
+    Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
+
+    std::string config_file = "yolov4-tiny.cfg";
+    std::string weights_file = "yolov4-tiny.weights";
+    double scoreDiff = 0.12;
+    double iouDiff = target == DNN_TARGET_OPENCL_FP16 ? 0.2 : 0.082;
+
+#if defined(INF_ENGINE_RELEASE)
+    if (target == DNN_TARGET_MYRIAD)  // bad accuracy
+        iouDiff = std::numeric_limits<double>::quiet_NaN();
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL)
+        iouDiff = std::numeric_limits<double>::quiet_NaN();
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
+        iouDiff = std::numeric_limits<double>::quiet_NaN();
+#endif
+
+    {
+        SCOPED_TRACE("batch size 1");
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, confThreshold);
+    }
+
+    /* bad accuracy on second image
+    {
+        SCOPED_TRACE("batch size 2");
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, confThreshold);
+    }
+    */
+
+#if defined(INF_ENGINE_RELEASE)
+    if (target == DNN_TARGET_MYRIAD)  // bad accuracy
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && target == DNN_TARGET_OPENCL)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+    if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
+         backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Test_Int8_nets, dnnBackendsAndTargets());
+}} // namespace
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index 20d3fb41ebe4..04d5fa63559e 100644
--- a/modules/dnn/test/test_layers.cpp
+++ b/modules/dnn/test/test_layers.cpp
@@ -445,7 +445,7 @@ class Layer_LSTM_Test : public ::testing::Test
 {
 public:
     int numInp, numOut;
-    Mat Wh, Wx, b;
+    Mat Wh, Wx, b, h, c;
     Ptr<LSTMLayer> layer;
     std::vector<Mat> inputs, outputs;
 
@@ -460,12 +460,17 @@ class Layer_LSTM_Test : public ::testing::Test
         Wh = Mat::ones(4 * numOut, numOut, CV_32F);
         Wx = Mat::ones(4 * numOut, numInp, CV_32F);
         b  = Mat::ones(4 * numOut, 1, CV_32F);
+        h  = Mat::ones(4, numOut, CV_32F);
+        c  = Mat::ones(4, numOut, CV_32F);
 
         LayerParams lp;
-        lp.blobs.resize(3);
+        lp.blobs.resize(5);
         lp.blobs[0] = Wh;
         lp.blobs[1] = Wx;
         lp.blobs[2] = b;
+        lp.blobs[3] = h;
+        lp.blobs[4] = c;
+
         lp.set<bool>("produce_cell_output", produceCellOutput);
         lp.set<bool>("use_timestamp_dim", useTimestampDim);
 
@@ -513,10 +518,12 @@ TEST_F(Layer_LSTM_Test, get_set_test)
 TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
 {
     LayerParams lp;
-    lp.blobs.resize(3);
+    lp.blobs.resize(5);
     lp.blobs[0] = blobFromNPY(_tf("lstm.prototxt.w_2.npy"));  // Wh
     lp.blobs[1] = blobFromNPY(_tf("lstm.prototxt.w_0.npy"));  // Wx
     lp.blobs[2] = blobFromNPY(_tf("lstm.prototxt.w_1.npy"));  // bias
+    lp.blobs[3] = Mat::zeros(2, 17, CV_32F);                     // h_0
+    lp.blobs[4] = Mat::zeros(2, 17, CV_32F);                     // c_0
     Ptr<LSTMLayer> layer = LSTMLayer::create(lp);
 
     Mat inp = blobFromNPY(_tf("recurrent.input.npy"));
@@ -527,6 +534,97 @@ TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent)
     normAssert(h_t_reference, outputs[0]);
 }
 
+TEST(Layer_LSTM_Test_Accuracy_with_, HiddenParams)
+{
+    Mat Wx = blobFromNPY(_tf("lstm.hidden.W.npy"));
+    Mat Wh = blobFromNPY(_tf("lstm.hidden.R.npy"));
+    Mat b = blobFromNPY(_tf("lstm.hidden.B.npy"));
+    Mat h0 = blobFromNPY(_tf("lstm.hidden.h0.npy"));
+    Mat c0 = blobFromNPY(_tf("lstm.hidden.c0.npy"));
+
+    const int numHidden = 3;
+    const int numDirs = Wx.size[0];
+    const int numFeatures = Wx.size[2];
+
+    b = b.reshape(1, b.size[0]);
+    Mat bx = b.colRange(0, b.cols / 2);
+    Mat bh = b.colRange(b.cols / 2, b.cols);
+    b = bx + bh;
+
+    // IFGO->IGFO
+    for (int k = 0; k < numDirs; ++k)
+    {
+        float* WxData = Wx.ptr<float>(k);
+        float* WhData = Wh.ptr<float>(k);
+        float* biasData = b.ptr<float>(k);
+        for (int j = 0; j < numHidden; ++j)
+        {
+            for (int i = 0; i < numFeatures; ++i)
+            {
+                std::swap(WxData[(numHidden + j) * numFeatures + i],
+                          WxData[(numHidden * 2 + j) * numFeatures + i]);
+            }
+            for (int i = 0; i < numHidden; ++i)
+            {
+                std::swap(WhData[(numHidden + j) * numHidden + i],
+                          WhData[(numHidden * 2 + j) * numHidden + i]);
+            }
+            std::swap(biasData[numHidden + j], biasData[numHidden * 2 + j]);
+        }
+    }
+
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+    h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    c0 = c0.reshape(1, c0.size[0] * c0.size[1]);
+
+    LayerParams lstmParams;
+    lstmParams.blobs.resize(5);
+    lstmParams.blobs[0] = Wh;
+    lstmParams.blobs[1] = Wx;
+    lstmParams.blobs[2] = b;
+    lstmParams.blobs[3] = h0;
+    lstmParams.blobs[4] = c0;
+    lstmParams.set("bidirectional", false);
+    Ptr<LSTMLayer> layer = LSTMLayer::create(lstmParams);
+
+    Mat inp = blobFromNPY(_tf("lstm.hidden.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+
+    Mat h_t_reference = blobFromNPY(_tf("lstm.hidden.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
+TEST(Layer_GRU_Test_Accuracy_with_, Pytorch)
+{
+    Mat Wx = blobFromNPY(_tf("gru.W.npy"));
+    Mat Wh = blobFromNPY(_tf("gru.R.npy"));
+    Mat b = blobFromNPY(_tf("gru.B.npy"));
+    Mat h0 = blobFromNPY(_tf("gru.h0.npy"));
+
+    Wx = Wx.reshape(1, Wx.size[0] * Wx.size[1]);
+    Wh = Wh.reshape(1, Wh.size[0] * Wh.size[1]);
+    h0 = h0.reshape(1, h0.size[0] * h0.size[1]);
+    b = b.reshape(1, b.size[0]);
+
+    LayerParams gruParams;
+    gruParams.blobs.resize(4);
+    gruParams.blobs[0] = Wh;
+    gruParams.blobs[1] = Wx;
+    gruParams.blobs[2] = b;
+    gruParams.blobs[3] = h0;
+    gruParams.set("bidirectional", false);
+    Ptr<GRULayer> layer = GRULayer::create(gruParams);
+
+    Mat inp = blobFromNPY(_tf("gru.input.npy"));
+    std::vector<Mat> inputs(1, inp), outputs;
+    runLayer(layer, inputs, outputs);
+
+    Mat h_t_reference = blobFromNPY(_tf("gru.output.npy"));
+    normAssert(h_t_reference, outputs[0]);
+}
+
 TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent)
 {
     Ptr<RNNLayer> layer = RNNLayer::create(LayerParams());
@@ -571,6 +669,9 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     bias.at<float>(2, 0) = 1e10f;  // Output gate - always output everything
     bias.at<float>(3, 0) = 0.f;  // Update signal
 
+    cv::Mat hInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+    cv::Mat cInternal = cv::Mat::zeros(1, 1, CV_32FC1);
+
     LayerParams lp;
     lp.set("reverse", true);
     lp.set("use_timestamp_dim", true);
@@ -578,6 +679,8 @@ TEST(Layer_LSTM_Test_Accuracy_, Reverse)
     lp.blobs.push_back(Wh);
     lp.blobs.push_back(Wx);
     lp.blobs.push_back(bias);
+    lp.blobs.push_back(hInternal);
+    lp.blobs.push_back(cInternal);
 
     cv::Ptr<cv::dnn::LSTMLayer> layer = LSTMLayer::create(lp);
     std::vector<cv::Mat> outputs;
diff --git a/modules/dnn/test/test_model.cpp b/modules/dnn/test/test_model.cpp
index f7befa9937ae..6ac9702c6993 100644
--- a/modules/dnn/test/test_model.cpp
+++ b/modules/dnn/test/test_model.cpp
@@ -615,6 +615,25 @@ TEST_P(Test_Model, TextRecognition)
     testTextRecognitionModel(weightPath, "", imgPath, seq, decodeType, vocabulary, size, mean, scale);
 }
 
+TEST_P(Test_Model, TextRecognitionWithCTCPrefixBeamSearch)
+{
+    if (target == DNN_TARGET_OPENCL_FP16)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_OPENCL_FP16);
+
+    std::string imgPath = _tf("text_rec_test.png");
+    std::string weightPath = _tf("onnx/models/crnn.onnx", false);
+    std::string seq = "welcome";
+
+    Size size{100, 32};
+    double scale = 1.0 / 127.5;
+    Scalar mean = Scalar(127.5);
+    std::string decodeType = "CTC-prefix-beam-search";
+    std::vector<std::string> vocabulary = {"0","1","2","3","4","5","6","7","8","9",
+                                           "a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"};
+
+    testTextRecognitionModel(weightPath, "", imgPath, seq, decodeType, vocabulary, size, mean, scale);
+}
+
 TEST_P(Test_Model, TextDetectionByDB)
 {
     if (target == DNN_TARGET_OPENCL_FP16)
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 22a504df69ad..5dfe9ca30c90 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -112,6 +112,7 @@ TEST_P(Test_ONNX_layers, MaxPooling_2)
 TEST_P(Test_ONNX_layers, Convolution)
 {
     testONNXModels("convolution");
+    testONNXModels("conv_asymmetric_pads");
 }
 
 TEST_P(Test_ONNX_layers, Convolution_variable_weight)
@@ -349,6 +350,7 @@ TEST_P(Test_ONNX_layers, Concatenation)
         if (target == DNN_TARGET_MYRIAD)      applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
     }
     testONNXModels("concatenation");
+    testONNXModels("concat_const_blobs");
 }
 
 TEST_P(Test_ONNX_layers, Eltwise3D)
@@ -650,6 +652,7 @@ TEST_P(Test_ONNX_layers, Split)
     testONNXModels("split_2");
     testONNXModels("split_3");
     testONNXModels("split_4");
+    testONNXModels("split_sizes");
 }
 
 TEST_P(Test_ONNX_layers, Slice)
@@ -698,6 +701,11 @@ TEST_P(Test_ONNX_layers, Split_EltwiseMax)
     testONNXModels("split_max");
 }
 
+TEST_P(Test_ONNX_layers, LSTM_Activations)
+{
+    testONNXModels("lstm_cntk_tanh", pb, 0, 0, false, false);
+}
+
 TEST_P(Test_ONNX_layers, LSTM)
 {
     testONNXModels("lstm", npy, 0, 0, false, false);
@@ -708,6 +716,26 @@ TEST_P(Test_ONNX_layers, LSTM_bidirectional)
     testONNXModels("lstm_bidirectional", npy, 0, 0, false, false);
 }
 
+TEST_P(Test_ONNX_layers, LSTM_hidden)
+{
+    testONNXModels("hidden_lstm", npy, 0, 0, false, false);
+}
+
+TEST_P(Test_ONNX_layers, LSTM_hidden_bidirectional)
+{
+    testONNXModels("hidden_lstm_bi", npy, 0, 0, false, false);
+}
+
+TEST_P(Test_ONNX_layers, GRU)
+{
+    testONNXModels("gru", npy, 0, 0, false, false);
+}
+
+TEST_P(Test_ONNX_layers, GRU_bidirectional)
+{
+    testONNXModels("gru_bi", npy, 0, 0, false, false);
+}
+
 TEST_P(Test_ONNX_layers, Pad2d_Unfused)
 {
     testONNXModels("ReflectionPad2d");
@@ -1335,6 +1363,15 @@ TEST_P(Test_ONNX_nets, Resnet34_kinetics)
     expectNoFallbacksFromIE(net);
 }
 
+TEST_P(Test_ONNX_layers, CumSum)
+{
+    testONNXModels("cumsum_1d_exclusive_1");
+    testONNXModels("cumsum_1d_reverse");
+    testONNXModels("cumsum_1d_exclusive_1_reverse");
+    testONNXModels("cumsum_2d_dim_1");
+    testONNXModels("cumsum_3d_dim_2");
+}
+
 INSTANTIATE_TEST_CASE_P(/**/, Test_ONNX_nets, dnnBackendsAndTargets());
 
 }} // namespace
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 2c3613472451..3d53ced0a450 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -13,6 +13,7 @@ Test for Tensorflow models loading
 #include "npy_blob.hpp"
 
 #include <opencv2/dnn/layer.details.hpp>  // CV_DNN_REGISTER_LAYER_CLASS
+#include <opencv2/dnn/utils/debug_utils.hpp>
 
 namespace opencv_test
 {
@@ -128,6 +129,13 @@ TEST_P(Test_TensorFlow_layers, reduce_mean)
     runTensorFlowNet("global_pool_by_axis");
 }
 
+TEST_P(Test_TensorFlow_layers, reduce_max)
+{
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);
+    runTensorFlowNet("max_pool_by_axis");
+}
+
 TEST_P(Test_TensorFlow_layers, reduce_sum)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
@@ -135,11 +143,21 @@ TEST_P(Test_TensorFlow_layers, reduce_sum)
     runTensorFlowNet("sum_pool_by_axis");
 }
 
+TEST_P(Test_TensorFlow_layers, reduce_max_channel)
+{
+    runTensorFlowNet("reduce_max_channel");
+}
+
 TEST_P(Test_TensorFlow_layers, reduce_sum_channel)
 {
     runTensorFlowNet("reduce_sum_channel");
 }
 
+TEST_P(Test_TensorFlow_layers, reduce_max_channel_keep_dims)
+{
+    runTensorFlowNet("reduce_max_channel", false, 0.0, 0.0, false, "_keep_dims");
+}
+
 TEST_P(Test_TensorFlow_layers, reduce_sum_channel_keep_dims)
 {
     runTensorFlowNet("reduce_sum_channel", false, 0.0, 0.0, false, "_keep_dims");
@@ -203,6 +221,16 @@ TEST_P(Test_TensorFlow_layers, padding)
     runTensorFlowNet("keras_pad_concat");
 }
 
+TEST_P(Test_TensorFlow_layers, padding_asymmetric)
+{
+    runTensorFlowNet("conv2d_asymmetric_pads_nchw");
+    runTensorFlowNet("conv2d_asymmetric_pads_nhwc");
+    runTensorFlowNet("max_pool2d_asymmetric_pads_nchw");
+    runTensorFlowNet("max_pool2d_asymmetric_pads_nhwc");
+    runTensorFlowNet("conv2d_backprop_input_asymmetric_pads_nchw");
+    runTensorFlowNet("conv2d_backprop_input_asymmetric_pads_nhwc");
+}
+
 TEST_P(Test_TensorFlow_layers, padding_same)
 {
     // Reference output values are in range [0.0006, 2.798]
@@ -385,6 +413,11 @@ TEST_P(Test_TensorFlow_layers, pooling_reduce_mean)
     runTensorFlowNet("reduce_mean");  // an average pooling over all spatial dimensions.
 }
 
+TEST_P(Test_TensorFlow_layers, pooling_reduce_max)
+{
+    runTensorFlowNet("reduce_max");  // a MAX pooling over all spatial dimensions.
+}
+
 TEST_P(Test_TensorFlow_layers, pooling_reduce_sum)
 {
     runTensorFlowNet("reduce_sum");  // a SUM pooling over all spatial dimensions.
@@ -568,6 +601,41 @@ TEST_P(Test_TensorFlow_layers, l2_normalize_3d)
     runTensorFlowNet("l2_normalize_3d");
 }
 
+class Test_TensorFlow_diagnostics : public DNNTestLayer {
+public:
+    Test_TensorFlow_diagnostics()
+    {
+        enableModelDiagnostics(true);
+        skipModelImport(true);
+    }
+
+    ~Test_TensorFlow_diagnostics()
+    {
+        enableModelDiagnostics(false);
+        skipModelImport(false);
+    }
+
+    void runFailingTensorFlowNet(const std::string& prefix, bool hasText = false)
+    {
+        std::string netPath = path(prefix + "_net.pb");
+        std::string netConfig = (hasText ? path(prefix + "_net.pbtxt") : "");
+
+        Net net = readNetFromTensorflow(netPath, netConfig);
+    }
+};
+
+TEST_P(Test_TensorFlow_diagnostics, not_implemented_layer)
+{
+    runFailingTensorFlowNet("not_implemented_layer");
+}
+
+TEST_P(Test_TensorFlow_diagnostics, broken_parameters)
+{
+    runFailingTensorFlowNet("broken_layer");
+}
+
+INSTANTIATE_TEST_CASE_P(/**/, Test_TensorFlow_diagnostics, dnnBackendsAndTargets());
+
 class Test_TensorFlow_nets : public DNNTestLayer {};
 
 TEST_P(Test_TensorFlow_nets, MobileNet_SSD)
diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp
index f1d636895baa..48725f244e57 100644
--- a/modules/dnn/test/test_torch_importer.cpp
+++ b/modules/dnn/test/test_torch_importer.cpp
@@ -290,9 +290,14 @@ TEST_P(Test_Torch_layers, net_padding)
 
 TEST_P(Test_Torch_layers, net_non_spatial)
 {
-#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2021030000)
+#if defined(INF_ENGINE_RELEASE) && ( \
+    INF_ENGINE_VER_MAJOR_EQ(2021030000) || \
+    INF_ENGINE_VER_MAJOR_EQ(2021040000) \
+)
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_MYRIAD)
-        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);  // crash
+        // 2021.3: crash
+        // 2021.4: [ GENERAL_ERROR ]  AssertionFailed: !out.networkInputs.empty()
+        applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);  // exception
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 16016082a2eb..e51bd9204491 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -1337,6 +1337,13 @@ CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& key
                              const std::vector<char>& matchesMask=std::vector<char>(), DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
 
 /** @overload */
+CV_EXPORTS_W void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                             InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                             const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                             const int matchesThickness, const Scalar& matchColor=Scalar::all(-1),
+                             const Scalar& singlePointColor=Scalar::all(-1), const std::vector<char>& matchesMask=std::vector<char>(),
+                             DrawMatchesFlags flags=DrawMatchesFlags::DEFAULT );
+
 CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
                              InputArray img2, const std::vector<KeyPoint>& keypoints2,
                              const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp
index 84fb0aca39ab..86f06dc8858d 100644
--- a/modules/features2d/src/draw.cpp
+++ b/modules/features2d/src/draw.cpp
@@ -183,7 +183,8 @@ static void _prepareImgAndDrawKeypoints( InputArray img1, const std::vector<KeyP
 }
 
 static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1, InputOutputArray outImg2 ,
-                          const KeyPoint& kp1, const KeyPoint& kp2, const Scalar& matchColor, DrawMatchesFlags flags )
+                          const KeyPoint& kp1, const KeyPoint& kp2, const Scalar& matchColor, DrawMatchesFlags flags,
+                          const int matchesThickness )
 {
     RNG& rng = theRNG();
     bool isRandMatchColor = matchColor == Scalar::all(-1);
@@ -199,7 +200,7 @@ static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1
     line( outImg,
           Point(cvRound(pt1.x*draw_multiplier), cvRound(pt1.y*draw_multiplier)),
           Point(cvRound(dpt2.x*draw_multiplier), cvRound(dpt2.y*draw_multiplier)),
-          color, 1, LINE_AA, draw_shift_bits );
+          color, matchesThickness, LINE_AA, draw_shift_bits );
 }
 
 void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
@@ -207,6 +208,21 @@ void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
                   const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
                   const Scalar& matchColor, const Scalar& singlePointColor,
                   const std::vector<char>& matchesMask, DrawMatchesFlags flags )
+{
+    drawMatches( img1, keypoints1,
+                 img2, keypoints2,
+                 matches1to2, outImg,
+                 1, matchColor,
+                 singlePointColor, matchesMask,
+                 flags);
+}
+
+void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
+                  InputArray img2, const std::vector<KeyPoint>& keypoints2,
+                  const std::vector<DMatch>& matches1to2, InputOutputArray outImg,
+                  const int matchesThickness, const Scalar& matchColor,
+                  const Scalar& singlePointColor, const std::vector<char>& matchesMask,
+                  DrawMatchesFlags flags )
 {
     if( !matchesMask.empty() && matchesMask.size() != matches1to2.size() )
         CV_Error( Error::StsBadSize, "matchesMask must have the same size as matches1to2" );
@@ -226,11 +242,12 @@ void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
             CV_Assert(i2 >= 0 && i2 < static_cast<int>(keypoints2.size()));
 
             const KeyPoint &kp1 = keypoints1[i1], &kp2 = keypoints2[i2];
-            _drawMatch( outImg, outImg1, outImg2, kp1, kp2, matchColor, flags );
+            _drawMatch( outImg, outImg1, outImg2, kp1, kp2, matchColor, flags, matchesThickness );
         }
     }
 }
 
+
 void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
                   InputArray img2, const std::vector<KeyPoint>& keypoints2,
                   const std::vector<std::vector<DMatch> >& matches1to2, InputOutputArray outImg,
@@ -254,7 +271,7 @@ void drawMatches( InputArray img1, const std::vector<KeyPoint>& keypoints1,
             if( matchesMask.empty() || matchesMask[i][j] )
             {
                 const KeyPoint &kp1 = keypoints1[i1], &kp2 = keypoints2[i2];
-                _drawMatch( outImg, outImg1, outImg2, kp1, kp2, matchColor, flags );
+                _drawMatch( outImg, outImg1, outImg2, kp1, kp2, matchColor, flags, 1 );
             }
         }
     }
diff --git a/modules/features2d/src/sift.simd.hpp b/modules/features2d/src/sift.simd.hpp
index b5033459b957..60129b1535b5 100644
--- a/modules/features2d/src/sift.simd.hpp
+++ b/modules/features2d/src/sift.simd.hpp
@@ -450,31 +450,184 @@ class findScaleSpaceExtremaT
             const sift_wt* currptr = img.ptr<sift_wt>(r);
             const sift_wt* prevptr = prev.ptr<sift_wt>(r);
             const sift_wt* nextptr = next.ptr<sift_wt>(r);
+            int c = SIFT_IMG_BORDER;
 
-            for( int c = SIFT_IMG_BORDER; c < cols-SIFT_IMG_BORDER; c++)
+#if CV_SIMD && !(DoG_TYPE_SHORT)
+            const int vecsize = v_float32::nlanes;
+            for( ; c <= cols-SIFT_IMG_BORDER - vecsize; c += vecsize)
+            {
+                v_float32 val = vx_load(&currptr[c]);
+                v_float32 _00,_01,_02;
+                v_float32 _10,    _12;
+                v_float32 _20,_21,_22;
+
+                v_float32 vmin,vmax;
+
+
+                v_float32 cond = v_abs(val) > vx_setall_f32((float)threshold);
+                if (!v_check_any(cond))
+                {
+                    continue;
+                }
+
+                _00 = vx_load(&currptr[c-step-1]); _01 = vx_load(&currptr[c-step]); _02 = vx_load(&currptr[c-step+1]);
+                _10 = vx_load(&currptr[c     -1]);                                  _12 = vx_load(&currptr[c     +1]);
+                _20 = vx_load(&currptr[c+step-1]); _21 = vx_load(&currptr[c+step]); _22 = vx_load(&currptr[c+step+1]);
+
+                vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
+                vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
+
+                v_float32 condp = cond & (val > vx_setall_f32(0)) & (val >= vmax);
+                v_float32 condm = cond & (val < vx_setall_f32(0)) & (val <= vmin);
+
+                cond = condp | condm;
+                if (!v_check_any(cond))
+                {
+                    continue;
+                }
+
+                _00 = vx_load(&prevptr[c-step-1]); _01 = vx_load(&prevptr[c-step]); _02 = vx_load(&prevptr[c-step+1]);
+                _10 = vx_load(&prevptr[c     -1]);                                  _12 = vx_load(&prevptr[c     +1]);
+                _20 = vx_load(&prevptr[c+step-1]); _21 = vx_load(&prevptr[c+step]); _22 = vx_load(&prevptr[c+step+1]);
+
+                vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
+                vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
+
+                condp &= (val >= vmax);
+                condm &= (val <= vmin);
+
+                cond = condp | condm;
+                if (!v_check_any(cond))
+                {
+                    continue;
+                }
+
+                v_float32 _11p = vx_load(&prevptr[c]);
+                v_float32 _11n = vx_load(&nextptr[c]);
+
+                v_float32 max_middle = v_max(_11n,_11p);
+                v_float32 min_middle = v_min(_11n,_11p);
+
+                _00 = vx_load(&nextptr[c-step-1]); _01 = vx_load(&nextptr[c-step]); _02 = vx_load(&nextptr[c-step+1]);
+                _10 = vx_load(&nextptr[c     -1]);                                  _12 = vx_load(&nextptr[c     +1]);
+                _20 = vx_load(&nextptr[c+step-1]); _21 = vx_load(&nextptr[c+step]); _22 = vx_load(&nextptr[c+step+1]);
+
+                vmax = v_max(v_max(v_max(_00,_01),v_max(_02,_10)),v_max(v_max(_12,_20),v_max(_21,_22)));
+                vmin = v_min(v_min(v_min(_00,_01),v_min(_02,_10)),v_min(v_min(_12,_20),v_min(_21,_22)));
+
+                condp &= (val >= v_max(vmax,max_middle));
+                condm &= (val <= v_min(vmin,min_middle));
+
+                cond = condp | condm;
+                if (!v_check_any(cond))
+                {
+                    continue;
+                }
+
+                int mask = v_signmask(cond);
+                for (int k = 0; k<vecsize;k++)
+                {
+                    if ((mask & (1<<k)) == 0)
+                        continue;
+
+                    CV_TRACE_REGION("pixel_candidate_simd");
+
+                    KeyPoint kpt;
+                    int r1 = r, c1 = c+k, layer = i;
+                    if( !adjustLocalExtrema(dog_pyr, kpt, o, layer, r1, c1,
+                                            nOctaveLayers, (float)contrastThreshold,
+                                            (float)edgeThreshold, (float)sigma) )
+                        continue;
+                    float scl_octv = kpt.size*0.5f/(1 << o);
+                    float omax = calcOrientationHist(gauss_pyr[o*(nOctaveLayers+3) + layer],
+                                                     Point(c1, r1),
+                                                     cvRound(SIFT_ORI_RADIUS * scl_octv),
+                                                     SIFT_ORI_SIG_FCTR * scl_octv,
+                                                     hist, n);
+                    float mag_thr = (float)(omax * SIFT_ORI_PEAK_RATIO);
+                    for( int j = 0; j < n; j++ )
+                    {
+                        int l = j > 0 ? j - 1 : n - 1;
+                        int r2 = j < n-1 ? j + 1 : 0;
+
+                        if( hist[j] > hist[l]  &&  hist[j] > hist[r2]  &&  hist[j] >= mag_thr )
+                        {
+                            float bin = j + 0.5f * (hist[l]-hist[r2]) / (hist[l] - 2*hist[j] + hist[r2]);
+                            bin = bin < 0 ? n + bin : bin >= n ? bin - n : bin;
+                            kpt.angle = 360.f - (float)((360.f/n) * bin);
+                            if(std::abs(kpt.angle - 360.f) < FLT_EPSILON)
+                                kpt.angle = 0.f;
+
+                            kpts_.push_back(kpt);
+                        }
+                    }
+                }
+            }
+
+#endif //CV_SIMD && !(DoG_TYPE_SHORT)
+
+            // vector loop reminder, better predictibility and less branch density
+            for( ; c < cols-SIFT_IMG_BORDER; c++)
             {
                 sift_wt val = currptr[c];
+                if (std::abs(val) <= threshold)
+                    continue;
+
+                sift_wt _00,_01,_02;
+                sift_wt _10,    _12;
+                sift_wt _20,_21,_22;
+                _00 = currptr[c-step-1]; _01 = currptr[c-step]; _02 = currptr[c-step+1];
+                _10 = currptr[c     -1];                        _12 = currptr[c     +1];
+                _20 = currptr[c+step-1]; _21 = currptr[c+step]; _22 = currptr[c+step+1];
+
+                bool calculate = false;
+                if (val > 0)
+                {
+                    sift_wt vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
+                    if (val >= vmax)
+                    {
+                        _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
+                        _10 = prevptr[c     -1];                        _12 = prevptr[c     +1];
+                        _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
+                        vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
+                        if (val >= vmax)
+                        {
+                            _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
+                            _10 = nextptr[c     -1];                        _12 = nextptr[c     +1];
+                            _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
+                            vmax = std::max(std::max(std::max(_00,_01),std::max(_02,_10)),std::max(std::max(_12,_20),std::max(_21,_22)));
+                            if (val >= vmax)
+                            {
+                                sift_wt _11p = prevptr[c], _11n = nextptr[c];
+                                calculate = (val >= std::max(_11p,_11n));
+                            }
+                        }
+                    }
+
+                } else  { // val cant be zero here (first abs took care of zero), must be negative
+                    sift_wt vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
+                    if (val <= vmin)
+                    {
+                        _00 = prevptr[c-step-1]; _01 = prevptr[c-step]; _02 = prevptr[c-step+1];
+                        _10 = prevptr[c     -1];                        _12 = prevptr[c     +1];
+                        _20 = prevptr[c+step-1]; _21 = prevptr[c+step]; _22 = prevptr[c+step+1];
+                        vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
+                        if (val <= vmin)
+                        {
+                            _00 = nextptr[c-step-1]; _01 = nextptr[c-step]; _02 = nextptr[c-step+1];
+                            _10 = nextptr[c     -1];                        _12 = nextptr[c     +1];
+                            _20 = nextptr[c+step-1]; _21 = nextptr[c+step]; _22 = nextptr[c+step+1];
+                            vmin = std::min(std::min(std::min(_00,_01),std::min(_02,_10)),std::min(std::min(_12,_20),std::min(_21,_22)));
+                            if (val <= vmin)
+                            {
+                                sift_wt _11p = prevptr[c], _11n = nextptr[c];
+                                calculate = (val <= std::min(_11p,_11n));
+                            }
+                        }
+                    }
+                }
 
-                // find local extrema with pixel accuracy
-                if( std::abs(val) > threshold &&
-                   ((val > 0 && val >= currptr[c-1] && val >= currptr[c+1] &&
-                     val >= currptr[c-step-1] && val >= currptr[c-step] && val >= currptr[c-step+1] &&
-                     val >= currptr[c+step-1] && val >= currptr[c+step] && val >= currptr[c+step+1] &&
-                     val >= nextptr[c] && val >= nextptr[c-1] && val >= nextptr[c+1] &&
-                     val >= nextptr[c-step-1] && val >= nextptr[c-step] && val >= nextptr[c-step+1] &&
-                     val >= nextptr[c+step-1] && val >= nextptr[c+step] && val >= nextptr[c+step+1] &&
-                     val >= prevptr[c] && val >= prevptr[c-1] && val >= prevptr[c+1] &&
-                     val >= prevptr[c-step-1] && val >= prevptr[c-step] && val >= prevptr[c-step+1] &&
-                     val >= prevptr[c+step-1] && val >= prevptr[c+step] && val >= prevptr[c+step+1]) ||
-                    (val < 0 && val <= currptr[c-1] && val <= currptr[c+1] &&
-                     val <= currptr[c-step-1] && val <= currptr[c-step] && val <= currptr[c-step+1] &&
-                     val <= currptr[c+step-1] && val <= currptr[c+step] && val <= currptr[c+step+1] &&
-                     val <= nextptr[c] && val <= nextptr[c-1] && val <= nextptr[c+1] &&
-                     val <= nextptr[c-step-1] && val <= nextptr[c-step] && val <= nextptr[c-step+1] &&
-                     val <= nextptr[c+step-1] && val <= nextptr[c+step] && val <= nextptr[c+step+1] &&
-                     val <= prevptr[c] && val <= prevptr[c-1] && val <= prevptr[c+1] &&
-                     val <= prevptr[c-step-1] && val <= prevptr[c-step] && val <= prevptr[c-step+1] &&
-                     val <= prevptr[c+step-1] && val <= prevptr[c+step] && val <= prevptr[c+step+1])))
+                if (calculate)
                 {
                     CV_TRACE_REGION("pixel_candidate");
 
diff --git a/modules/flann/include/opencv2/flann/heap.h b/modules/flann/include/opencv2/flann/heap.h
index ee1c682cfe98..8cace2044973 100644
--- a/modules/flann/include/opencv2/flann/heap.h
+++ b/modules/flann/include/opencv2/flann/heap.h
@@ -36,9 +36,21 @@
 #include <algorithm>
 #include <vector>
 
+#include <unordered_map>
+
 namespace cvflann
 {
 
+// TODO: Define x > y operator and use std::greater<T> instead
+template <typename T>
+struct greater
+{
+    bool operator()(const T& x, const T& y) const
+    {
+        return y < x;
+    }
+};
+
 /**
  * Priority Queue Implementation
  *
@@ -49,117 +61,180 @@ namespace cvflann
 template <typename T>
 class Heap
 {
-
     /**
      * Storage array for the heap.
      * Type T must be comparable.
      */
     std::vector<T> heap;
-    int length;
-
+public:
     /**
-     * Number of element in the heap
+     * \brief Constructs a heap with a pre-allocated capacity
+     *
+     * \param capacity heap maximum capacity
      */
-    int count;
-
-
+    Heap(const int capacity)
+    {
+        reserve(capacity);
+    }
 
-public:
     /**
-     * Constructor.
+     * \brief Move-constructs a heap from an external vector
      *
-     * Params:
-     *     sz = heap size
+     * \param vec external vector
      */
+    Heap(std::vector<T>&& vec)
+        : heap(std::move(vec))
+    {
+        std::make_heap(heap.begin(), heap.end(), greater<T>());
+    }
 
-    Heap(int sz)
+    /**
+     *
+     * \returns heap size
+     */
+    int size() const
     {
-        length = sz;
-        heap.reserve(length);
-        count = 0;
+        return (int)heap.size();
     }
 
     /**
      *
-     * Returns: heap size
+     * \returns heap capacity
      */
-    int size()
+    int capacity() const
     {
-        return count;
+        return (int)heap.capacity();
     }
 
     /**
-     * Tests if the heap is empty
+     * \brief Tests if the heap is empty
      *
-     * Returns: true is heap empty, false otherwise
+     * \returns true is heap empty, false otherwise
      */
     bool empty()
     {
-        return size()==0;
+        return heap.empty();
     }
 
     /**
-     * Clears the heap.
+     * \brief Clears the heap.
      */
     void clear()
     {
         heap.clear();
-        count = 0;
     }
 
-    struct CompareT
+    /**
+     * \brief Sets the heap maximum capacity.
+     *
+     * \param capacity heap maximum capacity
+     */
+    void reserve(const int capacity)
     {
-        bool operator()(const T& t_1, const T& t_2) const
-        {
-            return t_2 < t_1;
-        }
-    };
+        heap.reserve(capacity);
+    }
 
     /**
-     * Insert a new element in the heap.
+     * \brief Inserts a new element in the heap.
      *
      * We select the next empty leaf node, and then keep moving any larger
      * parents down until the right location is found to store this element.
      *
-     * Params:
-     *     value = the new element to be inserted in the heap
+     * \param value the new element to be inserted in the heap
      */
     void insert(T value)
     {
         /* If heap is full, then return without adding this element. */
-        if (count == length) {
+        if (size() == capacity()) {
             return;
         }
 
         heap.push_back(value);
-        static CompareT compareT;
-        std::push_heap(heap.begin(), heap.end(), compareT);
-        ++count;
+        std::push_heap(heap.begin(), heap.end(), greater<T>());
     }
 
-
-
     /**
-     * Returns the node of minimum value from the heap (top of the heap).
+     * \brief Returns the node of minimum value from the heap (top of the heap).
      *
-     * Params:
-     *     value = out parameter used to return the min element
-     * Returns: false if heap empty
+     * \param[out] value parameter used to return the min element
+     * \returns false if heap empty
      */
     bool popMin(T& value)
     {
-        if (count == 0) {
+        if (empty()) {
             return false;
         }
 
         value = heap[0];
-        static CompareT compareT;
-        std::pop_heap(heap.begin(), heap.end(), compareT);
+        std::pop_heap(heap.begin(), heap.end(), greater<T>());
         heap.pop_back();
-        --count;
 
         return true;  /* Return old last node. */
     }
+
+    /**
+     * \brief Returns a shared heap for the given memory pool ID.
+     *
+     * It constructs the heap if it does not already exists.
+     *
+     * \param poolId a user-chosen hashable ID for identifying the heap.
+     *     For thread-safe operations, using current thread ID is a good choice.
+     * \param capacity heap maximum capacity
+     * \param iterThreshold remove heaps that were not reused for more than specified iterations count
+     *        if iterThreshold value is less 2, it will be internally adjusted to twice the number of CPU threads
+     * \returns pointer to the heap
+     */
+    template <typename HashableT>
+    static cv::Ptr<Heap<T>> getPooledInstance(
+        const HashableT& poolId, const int capacity, int iterThreshold = 0)
+    {
+        static cv::Mutex mutex;
+        const cv::AutoLock lock(mutex);
+
+        struct HeapMapValueType {
+            cv::Ptr<Heap<T>> heapPtr;
+            int iterCounter;
+        };
+        typedef std::unordered_map<HashableT, HeapMapValueType> HeapMapType;
+
+        static HeapMapType heapsPool;
+        typename HeapMapType::iterator heapIt = heapsPool.find(poolId);
+
+        if (heapIt == heapsPool.end())
+        {
+            // Construct the heap as it does not already exists
+            HeapMapValueType heapAndTimePair = {cv::makePtr<Heap<T>>(capacity), 0};
+            const std::pair<typename HeapMapType::iterator, bool>& emplaceResult = heapsPool.emplace(poolId, std::move(heapAndTimePair));
+            CV_CheckEQ(static_cast<int>(emplaceResult.second), 1, "Failed to insert the heap into its memory pool");
+            heapIt = emplaceResult.first;
+        }
+        else
+        {
+            CV_CheckEQ(heapIt->second.heapPtr.use_count(), 1, "Cannot modify a heap that is currently accessed by another caller");
+            heapIt->second.heapPtr->clear();
+            heapIt->second.heapPtr->reserve(capacity);
+            heapIt->second.iterCounter = 0;
+        }
+
+        if (iterThreshold <= 1) {
+            iterThreshold = 2 * cv::getNumThreads();
+        }
+
+        // Remove heaps that were not reused for more than given iterThreshold
+        typename HeapMapType::iterator cleanupIt = heapsPool.begin();
+        while (cleanupIt != heapsPool.end())
+        {
+            if (cleanupIt->second.iterCounter++ > iterThreshold)
+            {
+                CV_Assert(cleanupIt != heapIt);
+                cleanupIt = heapsPool.erase(cleanupIt);
+                continue;
+            }
+            ++cleanupIt;
+        }
+
+        return heapIt->second.heapPtr;
+    }
 };
 
 }
diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
index 2d39d4f0f654..60662e7714b3 100644
--- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
+++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h
@@ -532,7 +532,7 @@ class HierarchicalClusteringIndex : public NNIndex<Distance>
         const bool explore_all_trees = get_param(searchParams,"explore_all_trees",false);
 
         // Priority queue storing intermediate branches in the best-bin-first search
-        Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
+        const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
 
         std::vector<bool> checked(size_,false);
         int checks = 0;
@@ -548,8 +548,6 @@ class HierarchicalClusteringIndex : public NNIndex<Distance>
             findNN(node, result, vec, checks, maxChecks, heap, checked, false);
         }
 
-        delete heap;
-
         CV_Assert(result.full());
     }
 
@@ -742,7 +740,7 @@ class HierarchicalClusteringIndex : public NNIndex<Distance>
 
 
     void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
-                Heap<BranchSt>* heap, std::vector<bool>& checked, bool explore_all_trees = false)
+                const cv::Ptr<Heap<BranchSt>>& heap, std::vector<bool>& checked, bool explore_all_trees = false)
     {
         if (node->childs==NULL) {
             if (!explore_all_trees && (checks>=maxChecks) && result.full()) {
diff --git a/modules/flann/include/opencv2/flann/kdtree_index.h b/modules/flann/include/opencv2/flann/kdtree_index.h
index 603fdbd421a5..8245f7db796e 100644
--- a/modules/flann/include/opencv2/flann/kdtree_index.h
+++ b/modules/flann/include/opencv2/flann/kdtree_index.h
@@ -445,11 +445,12 @@ class KDTreeIndex : public NNIndex<Distance>
     {
         int i;
         BranchSt branch;
-
         int checkCount = 0;
-        Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
         DynamicBitset checked(size_);
 
+        // Priority queue storing intermediate branches in the best-bin-first search
+        const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
+
         /* Search once through each tree down to root. */
         for (i = 0; i < trees_; ++i) {
             searchLevel(result, vec, tree_roots_[i], 0, checkCount, maxCheck,
@@ -464,8 +465,6 @@ class KDTreeIndex : public NNIndex<Distance>
                         epsError, heap, checked, false);
         }
 
-        delete heap;
-
         CV_Assert(result.full());
     }
 
@@ -476,7 +475,7 @@ class KDTreeIndex : public NNIndex<Distance>
      *  at least "mindistsq".
      */
     void searchLevel(ResultSet<DistanceType>& result_set, const ElementType* vec, NodePtr node, DistanceType mindist, int& checkCount, int maxCheck,
-                     float epsError, Heap<BranchSt>* heap, DynamicBitset& checked, bool explore_all_trees = false)
+                     float epsError, const cv::Ptr<Heap<BranchSt>>& heap, DynamicBitset& checked, bool explore_all_trees = false)
     {
         if (result_set.worstDist()<mindist) {
             //			printf("Ignoring branch, too far\n");
diff --git a/modules/flann/include/opencv2/flann/kmeans_index.h b/modules/flann/include/opencv2/flann/kmeans_index.h
index f73669999f16..fd7fe2bd39f4 100644
--- a/modules/flann/include/opencv2/flann/kmeans_index.h
+++ b/modules/flann/include/opencv2/flann/kmeans_index.h
@@ -528,7 +528,7 @@ class KMeansIndex : public NNIndex<Distance>
         }
         else {
             // Priority queue storing intermediate branches in the best-bin-first search
-            Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_);
+            const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
 
             int checks = 0;
             for (int i=0; i<trees_; ++i) {
@@ -542,8 +542,6 @@ class KMeansIndex : public NNIndex<Distance>
                 KMeansNodePtr node = branch.node;
                 findNN(node, result, vec, checks, maxChecks, heap);
             }
-            delete heap;
-
             CV_Assert(result.full());
         }
     }
@@ -1529,7 +1527,7 @@ class KMeansIndex : public NNIndex<Distance>
 
 
     void findNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
-                Heap<BranchSt>* heap)
+                const cv::Ptr<Heap<BranchSt>>& heap)
     {
         // Ignore those clusters that are too far away
         {
@@ -1577,7 +1575,7 @@ class KMeansIndex : public NNIndex<Distance>
      *     distances = array with the distances to each child node.
      * Returns:
      */
-    int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, Heap<BranchSt>* heap)
+    int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, const cv::Ptr<Heap<BranchSt>>& heap)
     {
 
         int best_index = 0;
diff --git a/modules/gapi/CMakeLists.txt b/modules/gapi/CMakeLists.txt
index c5046e8be6d8..69c0aaaae817 100644
--- a/modules/gapi/CMakeLists.txt
+++ b/modules/gapi/CMakeLists.txt
@@ -163,6 +163,10 @@ set(gapi_srcs
     src/backends/ie/bindings_ie.cpp
     src/backends/python/gpythonbackend.cpp
 
+    # Streaming source
+    src/streaming/onevpl/onevpl_source.cpp
+    src/streaming/onevpl/onevpl_source_priv.cpp
+
     # Utils (ITT tracing)
     src/utils/itt.cpp
     )
@@ -234,6 +238,17 @@ if(HAVE_PLAIDML)
   ocv_target_include_directories(${the_module} SYSTEM PRIVATE ${PLAIDML_INCLUDE_DIRS})
 endif()
 
+if(HAVE_GAPI_ONEVPL)
+  if(TARGET opencv_test_gapi)
+    ocv_target_compile_definitions(opencv_test_gapi PRIVATE -DHAVE_ONEVPL)
+    ocv_target_link_libraries(opencv_test_gapi PRIVATE ${VPL_IMPORTED_TARGETS})
+  endif()
+  ocv_target_compile_definitions(${the_module} PRIVATE -DHAVE_ONEVPL)
+  ocv_target_link_libraries(${the_module} PRIVATE ${VPL_IMPORTED_TARGETS})
+  if(HAVE_D3D11 AND HAVE_OPENCL)
+    ocv_target_include_directories(${the_module} SYSTEM PRIVATE ${OPENCL_INCLUDE_DIRS})
+  endif()
+endif()
 
 if(WIN32)
   # Required for htonl/ntohl on Windows
diff --git a/modules/gapi/cmake/init.cmake b/modules/gapi/cmake/init.cmake
index 4c25c75f555c..1c464328ca1d 100644
--- a/modules/gapi/cmake/init.cmake
+++ b/modules/gapi/cmake/init.cmake
@@ -32,3 +32,10 @@ if(WITH_PLAIDML)
       set(HAVE_PLAIDML TRUE)
   endif()
 endif()
+
+if(WITH_GAPI_ONEVPL)
+    find_package(VPL)
+    if(VPL_FOUND)
+        set(HAVE_GAPI_ONEVPL TRUE)
+    endif()
+endif()
diff --git a/modules/gapi/cmake/standalone.cmake b/modules/gapi/cmake/standalone.cmake
index d08eda1be5eb..f81c1c8a85de 100644
--- a/modules/gapi/cmake/standalone.cmake
+++ b/modules/gapi/cmake/standalone.cmake
@@ -6,6 +6,13 @@ if (NOT TARGET ade )
   find_package(ade 0.1.0 REQUIRED)
 endif()
 
+if (WITH_GAPI_ONEVPL)
+    find_package(VPL)
+    if(VPL_FOUND)
+        set(HAVE_GAPI_ONEVPL TRUE)
+    endif()
+endif()
+
 set(FLUID_TARGET fluid)
 set(FLUID_ROOT "${CMAKE_CURRENT_LIST_DIR}/../")
 
diff --git a/modules/gapi/include/opencv2/gapi.hpp b/modules/gapi/include/opencv2/gapi.hpp
index e4b20214796a..f10dfd471dbf 100644
--- a/modules/gapi/include/opencv2/gapi.hpp
+++ b/modules/gapi/include/opencv2/gapi.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 
 #ifndef OPENCV_GAPI_HPP
@@ -19,6 +19,7 @@
     @}
     @defgroup gapi_std_backends G-API Standard Backends
     @defgroup gapi_compile_args G-API Graph Compilation Arguments
+    @defgroup gapi_serialization G-API Serialization functionality
 @}
  */
 
diff --git a/modules/gapi/include/opencv2/gapi/garg.hpp b/modules/gapi/include/opencv2/gapi/garg.hpp
index 20f2233bf9c3..ee6ee81e1cc6 100644
--- a/modules/gapi/include/opencv2/gapi/garg.hpp
+++ b/modules/gapi/include/opencv2/gapi/garg.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
 
 
 #ifndef OPENCV_GAPI_GARG_HPP
@@ -171,7 +171,7 @@ using GRunArgs = std::vector<GRunArg>;
  * It's an ordinary overload of addition assignment operator.
  *
  * Example of usage:
- * @snippet dynamic_graph.cpp GRunArgs usage
+ * @snippet modules/gapi/samples/dynamic_graph.cpp GRunArgs usage
  *
  */
 inline GRunArgs& operator += (GRunArgs &lhs, const GRunArgs &rhs)
@@ -223,7 +223,7 @@ using GRunArgsP = std::vector<GRunArgP>;
  * It's an ordinary overload of addition assignment operator.
  *
  * Example of usage:
- * @snippet dynamic_graph.cpp GRunArgsP usage
+ * @snippet modules/gapi/samples/dynamic_graph.cpp GRunArgsP usage
  *
  */
 inline GRunArgsP& operator += (GRunArgsP &lhs, const GRunArgsP &rhs)
@@ -235,8 +235,39 @@ inline GRunArgsP& operator += (GRunArgsP &lhs, const GRunArgsP &rhs)
 
 namespace gapi
 {
-    GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &results);
-    GAPI_EXPORTS cv::GRunArg   bind(cv::GRunArgP &out);     // FIXME: think more about it
+/**
+ * \addtogroup gapi_serialization
+ * @{
+ *
+ * @brief G-API functions and classes for serialization and deserialization.
+ */
+/** @brief Wraps deserialized output GRunArgs to GRunArgsP which can be used by GCompiled.
+ *
+ * Since it's impossible to get modifiable output arguments from deserialization
+ * it needs to be wrapped by this function.
+ *
+ * Example of usage:
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp bind after deserialization
+ *
+ * @param out_args deserialized GRunArgs.
+ * @return the same GRunArgs wrapped in GRunArgsP.
+ * @see deserialize
+ */
+GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &out_args);
+/** @brief Wraps output GRunArgsP available during graph execution to GRunArgs which can be serialized.
+ *
+ * GRunArgsP is pointer-to-value, so to be serialized they need to be binded to real values
+ * which this function does.
+ *
+ * Example of usage:
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp bind before serialization
+ *
+ * @param out output GRunArgsP available during graph execution.
+ * @return the same GRunArgsP wrapped in serializable GRunArgs.
+ * @see serialize
+ */
+GAPI_EXPORTS cv::GRunArg   bind(cv::GRunArgP &out);     // FIXME: think more about it
+/** @} */
 }
 
 template<typename... Ts> inline GRunArgs gin(const Ts&... args)
diff --git a/modules/gapi/include/opencv2/gapi/gcommon.hpp b/modules/gapi/include/opencv2/gapi/gcommon.hpp
index a9cb0159014e..d3c280816ff9 100644
--- a/modules/gapi/include/opencv2/gapi/gcommon.hpp
+++ b/modules/gapi/include/opencv2/gapi/gcommon.hpp
@@ -44,6 +44,7 @@ namespace detail
         CV_UNKNOWN,    // Unknown, generic, opaque-to-GAPI data type unsupported in graph seriallization
         CV_BOOL,       // bool user G-API data
         CV_INT,        // int user G-API data
+        CV_INT64,      // int64_t user G-API data
         CV_DOUBLE,     // double user G-API data
         CV_FLOAT,      // float user G-API data
         CV_UINT64,     // uint64_t user G-API data
@@ -61,6 +62,7 @@ namespace detail
     template<typename T> struct GOpaqueTraits;
     template<typename T> struct GOpaqueTraits    { static constexpr const OpaqueKind kind = OpaqueKind::CV_UNKNOWN; };
     template<> struct GOpaqueTraits<int>         { static constexpr const OpaqueKind kind = OpaqueKind::CV_INT; };
+    template<> struct GOpaqueTraits<int64_t>     { static constexpr const OpaqueKind kind = OpaqueKind::CV_INT64; };
     template<> struct GOpaqueTraits<double>      { static constexpr const OpaqueKind kind = OpaqueKind::CV_DOUBLE; };
     template<> struct GOpaqueTraits<float>       { static constexpr const OpaqueKind kind = OpaqueKind::CV_FLOAT; };
     template<> struct GOpaqueTraits<uint64_t>    { static constexpr const OpaqueKind kind = OpaqueKind::CV_UINT64; };
diff --git a/modules/gapi/include/opencv2/gapi/gproto.hpp b/modules/gapi/include/opencv2/gapi/gproto.hpp
index fbcccb38ea71..6271e470b076 100644
--- a/modules/gapi/include/opencv2/gapi/gproto.hpp
+++ b/modules/gapi/include/opencv2/gapi/gproto.hpp
@@ -71,7 +71,7 @@ struct GIOProtoArgs
      * It's an ordinary overload of addition assignment operator.
      *
      * Example of usage:
-     * @snippet dynamic_graph.cpp  GIOProtoArgs usage
+     * @snippet modules/gapi/samples/dynamic_graph.cpp  GIOProtoArgs usage
      *
      */
     template<typename Tg>
diff --git a/modules/gapi/include/opencv2/gapi/gstreaming.hpp b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
index 5bbed5e12dda..50abe69f87b7 100644
--- a/modules/gapi/include/opencv2/gapi/gstreaming.hpp
+++ b/modules/gapi/include/opencv2/gapi/gstreaming.hpp
@@ -71,6 +71,15 @@ using GOptRunArgP = util::variant<
 >;
 using GOptRunArgsP = std::vector<GOptRunArgP>;
 
+using GOptRunArg = util::variant<
+    optional<cv::Mat>,
+    optional<cv::RMat>,
+    optional<cv::Scalar>,
+    optional<cv::detail::VectorRef>,
+    optional<cv::detail::OpaqueRef>
+>;
+using GOptRunArgs = std::vector<GOptRunArg>;
+
 namespace detail {
 
 template<typename T> inline GOptRunArgP wrap_opt_arg(optional<T>& arg) {
@@ -196,7 +205,7 @@ class GAPI_EXPORTS_W_SIMPLE GStreamingCompiled
      * @param s a shared pointer to IStreamSource representing the
      * input video stream.
      */
-    GAPI_WRAP void setSource(const gapi::wip::IStreamSource::Ptr& s);
+    void setSource(const gapi::wip::IStreamSource::Ptr& s);
 
     /**
      * @brief Constructs and specifies an input video stream for a
@@ -255,7 +264,7 @@ class GAPI_EXPORTS_W_SIMPLE GStreamingCompiled
 
     // NB: Used from python
     /// @private -- Exclude this function from OpenCV documentation
-    GAPI_WRAP std::tuple<bool, cv::GRunArgs> pull();
+    GAPI_WRAP std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> pull();
 
     /**
      * @brief Get some next available data from the pipeline.
diff --git a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
index 0b11b18485c0..2e8dcb1aec7d 100644
--- a/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
@@ -43,19 +43,6 @@ namespace detail
         GOPAQUE,      // a cv::GOpaqueU (note - exactly GOpaqueU, not GOpaque<T>!)
     };
 
-    template<typename T>
-    constexpr const char* meta_to_string() noexcept;
-    template<>
-    constexpr const char* meta_to_string<cv::GMatDesc>() noexcept { return "GMatDesc"; }
-    template<>
-    constexpr const char* meta_to_string<cv::GScalarDesc>() noexcept { return "GScalarDesc"; }
-    template<>
-    constexpr const char* meta_to_string<cv::GArrayDesc>() noexcept { return "GArrayDesc"; }
-    template<>
-    constexpr const char* meta_to_string<cv::GOpaqueDesc>() noexcept { return "GOpaqueDesc"; }
-    template<>
-    constexpr const char* meta_to_string<cv::GFrameDesc>() noexcept { return "GFrameDesc";}
-
     // Describe G-API types (G-types) with traits.  Mostly used by
     // cv::GArg to store meta information about types passed into
     // operation arguments. Please note that cv::GComputation is
diff --git a/modules/gapi/include/opencv2/gapi/gtyped.hpp b/modules/gapi/include/opencv2/gapi/gtyped.hpp
index 27d977794454..6fe52a62e16b 100644
--- a/modules/gapi/include/opencv2/gapi/gtyped.hpp
+++ b/modules/gapi/include/opencv2/gapi/gtyped.hpp
@@ -35,7 +35,6 @@ namespace detail
     template<> struct ProtoToMeta<cv::GScalar>  { using type = cv::GScalarDesc; };
     template<typename U> struct ProtoToMeta<cv::GArray<U> >  { using type = cv::GArrayDesc; };
     template<typename U> struct ProtoToMeta<cv::GOpaque<U> > { using type = cv::GOpaqueDesc; };
-    template<> struct ProtoToMeta<cv::GFrame>  { using type = cv::GFrameDesc; };
     template<typename T> using ProtoToMetaT = typename ProtoToMeta<T>::type;
 
     //workaround for MSVC 19.0 bug
diff --git a/modules/gapi/include/opencv2/gapi/infer.hpp b/modules/gapi/include/opencv2/gapi/infer.hpp
index 93701856bbdb..807c82d31f89 100644
--- a/modules/gapi/include/opencv2/gapi/infer.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer.hpp
@@ -136,11 +136,12 @@ class GInferInputsTyped
     }
 
     template <typename U>
-    void setInput(const std::string& name, U in)
+    GInferInputsTyped<Ts...>& setInput(const std::string& name, U in)
     {
         m_priv->blobs.emplace(std::piecewise_construct,
                               std::forward_as_tuple(name),
                               std::forward_as_tuple(in));
+        return *this;
     }
 
     using StorageT = cv::util::variant<Ts...>;
@@ -654,7 +655,7 @@ namespace gapi {
 // A type-erased form of network parameters.
 // Similar to how a type-erased GKernel is represented and used.
 /// @private
-struct GAPI_EXPORTS GNetParam {
+struct GAPI_EXPORTS_W_SIMPLE GNetParam {
     std::string tag;     // FIXME: const?
     GBackend backend;    // Specifies the execution model
     util::any params;    // Backend-interpreted parameter structure
@@ -671,6 +672,7 @@ struct GAPI_EXPORTS GNetParam {
  */
 struct GAPI_EXPORTS_W_SIMPLE GNetPackage {
     GAPI_WRAP GNetPackage() = default;
+    GAPI_WRAP explicit GNetPackage(std::vector<GNetParam> nets);
     explicit GNetPackage(std::initializer_list<GNetParam> ii);
     std::vector<GBackend> backends() const;
     std::vector<GNetParam> networks;
diff --git a/modules/gapi/include/opencv2/gapi/infer/bindings_ie.hpp b/modules/gapi/include/opencv2/gapi/infer/bindings_ie.hpp
index fdd4128b1ae2..92ef2101a179 100644
--- a/modules/gapi/include/opencv2/gapi/infer/bindings_ie.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/bindings_ie.hpp
@@ -22,17 +22,28 @@ namespace ie {
 // This class can be marked as SIMPLE, because it's implemented as pimpl
 class GAPI_EXPORTS_W_SIMPLE PyParams {
 public:
+    GAPI_WRAP
     PyParams() = default;
 
+    GAPI_WRAP
     PyParams(const std::string &tag,
              const std::string &model,
              const std::string &weights,
              const std::string &device);
 
+    GAPI_WRAP
     PyParams(const std::string &tag,
              const std::string &model,
              const std::string &device);
 
+    GAPI_WRAP
+    PyParams& constInput(const std::string &layer_name,
+                         const cv::Mat &data,
+                         TraitAs hint = TraitAs::TENSOR);
+
+    GAPI_WRAP
+    PyParams& cfgNumRequests(size_t nireq);
+
     GBackend      backend() const;
     std::string   tag()     const;
     cv::util::any params()  const;
diff --git a/modules/gapi/include/opencv2/gapi/infer/ie.hpp b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
index 70712ba74039..2be739e51840 100644
--- a/modules/gapi/include/opencv2/gapi/infer/ie.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/ie.hpp
@@ -74,7 +74,11 @@ struct ParamDesc {
     std::map<std::string, std::vector<std::size_t>> reshape_table;
     std::unordered_set<std::string> layer_names_to_reshape;
 
+    // NB: Number of asyncrhonious infer requests
     size_t nireq;
+
+    // NB: An optional config to setup RemoteContext for IE
+    cv::util::any context_config;
 };
 } // namespace detail
 
@@ -115,7 +119,8 @@ template<typename Net> class Params {
               , {}
               , {}
               , {}
-              , 1u} {
+              , 1u
+              , {}} {
     };
 
     /** @overload
@@ -135,7 +140,8 @@ template<typename Net> class Params {
               , {}
               , {}
               , {}
-              , 1u} {
+              , 1u
+              , {}} {
     };
 
     /** @brief Specifies sequence of network input layers names for inference.
@@ -217,6 +223,30 @@ template<typename Net> class Params {
         return *this;
     }
 
+    /** @brief Specifies configuration for RemoteContext in InferenceEngine.
+
+    When RemoteContext is configured the backend imports the networks using the context.
+    It also expects cv::MediaFrames to be actually remote, to operate with blobs via the context.
+
+    @param ctx_cfg cv::util::any value which holds InferenceEngine::ParamMap.
+    @return reference to this parameter structure.
+    */
+    Params& cfgContextParams(const cv::util::any& ctx_cfg) {
+        desc.context_config = ctx_cfg;
+        return *this;
+    }
+
+    /** @overload
+    Function with an rvalue parameter.
+
+    @param ctx_cfg cv::util::any value which holds InferenceEngine::ParamMap.
+    @return reference to this parameter structure.
+    */
+    Params& cfgContextParams(cv::util::any&& ctx_cfg) {
+        desc.context_config = std::move(ctx_cfg);
+        return *this;
+    }
+
     /** @brief Specifies number of asynchronous inference requests.
 
     @param nireq Number of inference asynchronous requests.
@@ -318,7 +348,10 @@ class Params<cv::gapi::Generic> {
            const std::string &model,
            const std::string &weights,
            const std::string &device)
-        : desc{ model, weights, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u}, m_tag(tag) {
+        : desc{ model, weights, device, {}, {}, {}, 0u, 0u,
+                detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u,
+                {}},
+          m_tag(tag) {
     };
 
     /** @overload
@@ -333,7 +366,10 @@ class Params<cv::gapi::Generic> {
     Params(const std::string &tag,
            const std::string &model,
            const std::string &device)
-        : desc{ model, {}, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u}, m_tag(tag) {
+        : desc{ model, {}, device, {}, {}, {}, 0u, 0u,
+                detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u,
+                {}},
+          m_tag(tag) {
     };
 
     /** @see ie::Params::pluginConfig. */
diff --git a/modules/gapi/include/opencv2/gapi/infer/parsers.hpp b/modules/gapi/include/opencv2/gapi/infer/parsers.hpp
index 22c8701a6c2e..c7308dd39f47 100644
--- a/modules/gapi/include/opencv2/gapi/infer/parsers.hpp
+++ b/modules/gapi/include/opencv2/gapi/infer/parsers.hpp
@@ -64,10 +64,10 @@ detection is smaller than confidence threshold, detection is rejected.
 given label will get to the output.
 @return a tuple with a vector of detected boxes and a vector of appropriate labels.
 */
-GAPI_EXPORTS std::tuple<GArray<Rect>, GArray<int>> parseSSD(const GMat& in,
-                                                            const GOpaque<Size>& inSz,
-                                                            const float confidenceThreshold = 0.5f,
-                                                            const int   filterLabel = -1);
+GAPI_EXPORTS_W std::tuple<GArray<Rect>, GArray<int>> parseSSD(const GMat& in,
+                                                              const GOpaque<Size>& inSz,
+                                                              const float confidenceThreshold = 0.5f,
+                                                              const int   filterLabel = -1);
 
 /** @brief Parses output of SSD network.
 
@@ -113,12 +113,12 @@ If 1.f, nms is not performed and no boxes are rejected.
 <a href="https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/yolo-v2-tiny-tf/yolo-v2-tiny-tf.md">documentation</a>.
 @return a tuple with a vector of detected boxes and a vector of appropriate labels.
 */
-GAPI_EXPORTS std::tuple<GArray<Rect>, GArray<int>> parseYolo(const GMat& in,
-                                                             const GOpaque<Size>& inSz,
-                                                             const float confidenceThreshold = 0.5f,
-                                                             const float nmsThreshold = 0.5f,
-                                                             const std::vector<float>& anchors
-                                                                 = nn::parsers::GParseYolo::defaultAnchors());
+GAPI_EXPORTS_W std::tuple<GArray<Rect>, GArray<int>> parseYolo(const GMat& in,
+                                                               const GOpaque<Size>& inSz,
+                                                               const float confidenceThreshold = 0.5f,
+                                                               const float nmsThreshold = 0.5f,
+                                                               const std::vector<float>& anchors
+                                                                   = nn::parsers::GParseYolo::defaultAnchors());
 
 } // namespace gapi
 } // namespace cv
diff --git a/modules/gapi/include/opencv2/gapi/media.hpp b/modules/gapi/include/opencv2/gapi/media.hpp
index aa7d6d6a1f4e..19aaef3fd1a5 100644
--- a/modules/gapi/include/opencv2/gapi/media.hpp
+++ b/modules/gapi/include/opencv2/gapi/media.hpp
@@ -15,6 +15,16 @@
 #include <opencv2/gapi/gframe.hpp>
 #include <opencv2/gapi/util/any.hpp>
 
+// Forward declaration
+namespace cv {
+namespace gapi {
+namespace s11n {
+struct IOStream;
+struct IIStream;
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+
 namespace cv {
 
 /** \addtogroup gapi_data_structures
@@ -125,6 +135,16 @@ class GAPI_EXPORTS MediaFrame {
         return dynamic_cast<T*>(adapter);
     }
 
+    /**
+     * @brief Serialize MediaFrame's data to a byte array.
+     *
+     * @note The actual logic is implemented by frame's adapter class.
+     * Does nothing by default.
+     *
+     * @param os Bytestream to store serialized MediaFrame data in.
+     */
+    void serialize(cv::gapi::s11n::IOStream& os) const;
+
 private:
     struct Priv;
     std::shared_ptr<Priv> m;
@@ -221,6 +241,14 @@ class GAPI_EXPORTS MediaFrame::IAdapter {
     // FIXME: design a better solution
     // The default implementation does nothing
     virtual cv::util::any blobParams() const;
+    virtual void serialize(cv::gapi::s11n::IOStream&) {
+        GAPI_Assert(false && "Generic serialize method of MediaFrame::IAdapter does nothing by default. "
+                             "Please, implement it in derived class to properly serialize the object.");
+    }
+    virtual void deserialize(cv::gapi::s11n::IIStream&) {
+        GAPI_Assert(false && "Generic deserialize method of MediaFrame::IAdapter does nothing by default. "
+                             "Please, implement it in derived class to properly deserialize the object.");
+    }
 };
 /** @} */
 
diff --git a/modules/gapi/include/opencv2/gapi/own/exports.hpp b/modules/gapi/include/opencv2/gapi/own/exports.hpp
index 1978991b7518..c36f4003d0fb 100644
--- a/modules/gapi/include/opencv2/gapi/own/exports.hpp
+++ b/modules/gapi/include/opencv2/gapi/own/exports.hpp
@@ -13,11 +13,13 @@
 #       define GAPI_EXPORTS CV_EXPORTS
         /* special informative macros for wrapper generators */
 #       define GAPI_PROP CV_PROP
+#       define GAPI_PROP_RW CV_PROP_RW
 #       define GAPI_WRAP CV_WRAP
 #       define GAPI_EXPORTS_W_SIMPLE CV_EXPORTS_W_SIMPLE
 #       define GAPI_EXPORTS_W CV_EXPORTS_W
 #   else
 #       define GAPI_PROP
+#       define GAPI_PROP_RW
 #       define GAPI_WRAP
 #       define GAPI_EXPORTS
 #       define GAPI_EXPORTS_W_SIMPLE
diff --git a/modules/gapi/include/opencv2/gapi/render/render.hpp b/modules/gapi/include/opencv2/gapi/render/render.hpp
index 6bfe92388a12..537541222414 100644
--- a/modules/gapi/include/opencv2/gapi/render/render.hpp
+++ b/modules/gapi/include/opencv2/gapi/render/render.hpp
@@ -81,9 +81,9 @@ using GMatDesc2 = std::tuple<cv::GMatDesc,cv::GMatDesc>;
 @param prims vector of drawing primitivies
 @param args graph compile time parameters
 */
-void GAPI_EXPORTS render(cv::Mat& bgr,
-                         const Prims& prims,
-                         cv::GCompileArgs&& args = {});
+void GAPI_EXPORTS_W render(cv::Mat& bgr,
+                           const Prims& prims,
+                           cv::GCompileArgs&& args = {});
 
 /** @brief The function renders on two NV12 planes passed drawing primitivies
 
@@ -92,10 +92,10 @@ void GAPI_EXPORTS render(cv::Mat& bgr,
 @param prims vector of drawing primitivies
 @param args graph compile time parameters
 */
-void GAPI_EXPORTS render(cv::Mat& y_plane,
-                         cv::Mat& uv_plane,
-                         const Prims& prims,
-                         cv::GCompileArgs&& args = {});
+void GAPI_EXPORTS_W render(cv::Mat& y_plane,
+                           cv::Mat& uv_plane,
+                           const Prims& prims,
+                           cv::GCompileArgs&& args = {});
 
 /** @brief The function renders on the input media frame passed drawing primitivies
 
@@ -139,7 +139,7 @@ Output image must be 8-bit unsigned planar 3-channel image
 @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3
 @param prims draw primitives
 */
-GAPI_EXPORTS GMat render3ch(const GMat& src, const GArray<Prim>& prims);
+GAPI_EXPORTS_W GMat render3ch(const GMat& src, const GArray<Prim>& prims);
 
 /** @brief Renders on two planes
 
@@ -150,9 +150,9 @@ uv image must be 8-bit unsigned planar 2-channel image @ref CV_8UC2
 @param uv input image: 8-bit unsigned 2-channel image @ref CV_8UC2
 @param prims draw primitives
 */
-GAPI_EXPORTS GMat2 renderNV12(const GMat& y,
-                              const GMat& uv,
-                              const GArray<Prim>& prims);
+GAPI_EXPORTS_W GMat2 renderNV12(const GMat& y,
+                                const GMat& uv,
+                                const GArray<Prim>& prims);
 
 /** @brief Renders Media Frame
 
@@ -177,7 +177,7 @@ namespace render
 {
 namespace ocv
 {
-    GAPI_EXPORTS cv::gapi::GKernelPackage kernels();
+    GAPI_EXPORTS_W cv::gapi::GKernelPackage kernels();
 
 } // namespace ocv
 } // namespace render
diff --git a/modules/gapi/include/opencv2/gapi/render/render_types.hpp b/modules/gapi/include/opencv2/gapi/render/render_types.hpp
index ca403be361ee..6d70e3a877dd 100644
--- a/modules/gapi/include/opencv2/gapi/render/render_types.hpp
+++ b/modules/gapi/include/opencv2/gapi/render/render_types.hpp
@@ -41,7 +41,7 @@ struct freetype_font
  *
  * Parameters match cv::putText().
  */
-struct Text
+struct GAPI_EXPORTS_W_SIMPLE Text
 {
     /**
      * @brief Text constructor
@@ -55,6 +55,7 @@ struct Text
      * @param lt_                 The line type. See #LineTypes
      * @param bottom_left_origin_ When true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner
      */
+    GAPI_WRAP
     Text(const std::string& text_,
          const cv::Point& org_,
          int ff_,
@@ -68,17 +69,18 @@ struct Text
     {
     }
 
+    GAPI_WRAP
     Text() = default;
 
     /*@{*/
-    std::string text;               //!< The text string to be drawn
-    cv::Point   org;                //!< The bottom-left corner of the text string in the image
-    int         ff;                 //!< The font type, see #HersheyFonts
-    double      fs;                 //!< The font scale factor that is multiplied by the font-specific base size
-    cv::Scalar  color;              //!< The text color
-    int         thick;              //!< The thickness of the lines used to draw a text
-    int         lt;                 //!< The line type. See #LineTypes
-    bool        bottom_left_origin; //!< When true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner
+    GAPI_PROP_RW std::string text;               //!< The text string to be drawn
+    GAPI_PROP_RW cv::Point   org;                //!< The bottom-left corner of the text string in the image
+    GAPI_PROP_RW int         ff;                 //!< The font type, see #HersheyFonts
+    GAPI_PROP_RW double      fs;                 //!< The font scale factor that is multiplied by the font-specific base size
+    GAPI_PROP_RW cv::Scalar  color;              //!< The text color
+    GAPI_PROP_RW int         thick;              //!< The thickness of the lines used to draw a text
+    GAPI_PROP_RW int         lt;                 //!< The line type. See #LineTypes
+    GAPI_PROP_RW bool        bottom_left_origin; //!< When true, the image data origin is at the bottom-left corner. Otherwise, it is at the top-left corner
     /*@{*/
 };
 
@@ -122,7 +124,7 @@ struct FText
  *
  * Parameters match cv::rectangle().
  */
-struct Rect
+struct GAPI_EXPORTS_W_SIMPLE Rect
 {
     /**
      * @brief Rect constructor
@@ -142,14 +144,15 @@ struct Rect
     {
     }
 
+    GAPI_WRAP
     Rect() = default;
 
     /*@{*/
-    cv::Rect   rect;  //!< Coordinates of the rectangle
-    cv::Scalar color; //!< The rectangle color or brightness (grayscale image)
-    int        thick; //!< The thickness of lines that make up the rectangle. Negative values, like #FILLED, mean that the function has to draw a filled rectangle
-    int        lt;    //!< The type of the line. See #LineTypes
-    int        shift; //!< The number of fractional bits in the point coordinates
+    GAPI_PROP_RW cv::Rect   rect;  //!< Coordinates of the rectangle
+    GAPI_PROP_RW cv::Scalar color; //!< The rectangle color or brightness (grayscale image)
+    GAPI_PROP_RW int        thick; //!< The thickness of lines that make up the rectangle. Negative values, like #FILLED, mean that the function has to draw a filled rectangle
+    GAPI_PROP_RW int        lt;    //!< The type of the line. See #LineTypes
+    GAPI_PROP_RW int        shift; //!< The number of fractional bits in the point coordinates
     /*@{*/
 };
 
@@ -158,7 +161,7 @@ struct Rect
  *
  * Parameters match cv::circle().
  */
-struct Circle
+struct GAPI_EXPORTS_W_SIMPLE Circle
 {
     /**
      * @brief Circle constructor
@@ -170,6 +173,7 @@ struct Circle
      * @param  lt_     The Type of the circle boundary. See #LineTypes
      * @param  shift_  The Number of fractional bits in the coordinates of the center and in the radius value
      */
+    GAPI_WRAP
     Circle(const cv::Point& center_,
            int radius_,
            const cv::Scalar& color_,
@@ -180,15 +184,16 @@ struct Circle
     {
     }
 
+    GAPI_WRAP
     Circle() = default;
 
     /*@{*/
-    cv::Point  center; //!< The center of the circle
-    int        radius; //!< The radius of the circle
-    cv::Scalar color;  //!< The color of the  circle
-    int        thick;  //!< The thickness of the circle outline, if positive. Negative values, like #FILLED, mean that a filled circle is to be drawn
-    int        lt;     //!< The Type of the circle boundary. See #LineTypes
-    int        shift;  //!< The Number of fractional bits in the coordinates of the center and in the radius value
+    GAPI_PROP_RW cv::Point  center; //!< The center of the circle
+    GAPI_PROP_RW int        radius; //!< The radius of the circle
+    GAPI_PROP_RW cv::Scalar color;  //!< The color of the  circle
+    GAPI_PROP_RW int        thick;  //!< The thickness of the circle outline, if positive. Negative values, like #FILLED, mean that a filled circle is to be drawn
+    GAPI_PROP_RW int        lt;     //!< The Type of the circle boundary. See #LineTypes
+    GAPI_PROP_RW int        shift;  //!< The Number of fractional bits in the coordinates of the center and in the radius value
     /*@{*/
 };
 
@@ -197,7 +202,7 @@ struct Circle
  *
  * Parameters match cv::line().
  */
-struct Line
+struct GAPI_EXPORTS_W_SIMPLE Line
 {
     /**
      * @brief Line constructor
@@ -209,6 +214,7 @@ struct Line
      * @param  lt_     The Type of the line. See #LineTypes
      * @param  shift_  The number of fractional bits in the point coordinates
     */
+    GAPI_WRAP
     Line(const cv::Point& pt1_,
          const cv::Point& pt2_,
          const cv::Scalar& color_,
@@ -219,15 +225,16 @@ struct Line
     {
     }
 
+    GAPI_WRAP
     Line() = default;
 
     /*@{*/
-    cv::Point  pt1;    //!< The first point of the line segment
-    cv::Point  pt2;    //!< The second point of the line segment
-    cv::Scalar color;  //!< The line color
-    int        thick;  //!< The thickness of line
-    int        lt;     //!< The Type of the line. See #LineTypes
-    int        shift;  //!< The number of fractional bits in the point coordinates
+    GAPI_PROP_RW cv::Point  pt1;    //!< The first point of the line segment
+    GAPI_PROP_RW cv::Point  pt2;    //!< The second point of the line segment
+    GAPI_PROP_RW cv::Scalar color;  //!< The line color
+    GAPI_PROP_RW int        thick;  //!< The thickness of line
+    GAPI_PROP_RW int        lt;     //!< The Type of the line. See #LineTypes
+    GAPI_PROP_RW int        shift;  //!< The number of fractional bits in the point coordinates
     /*@{*/
 };
 
@@ -236,7 +243,7 @@ struct Line
  *
  * Mosaicing is a very basic method to obfuscate regions in the image.
  */
-struct Mosaic
+struct GAPI_EXPORTS_W_SIMPLE Mosaic
 {
     /**
      * @brief Mosaic constructor
@@ -252,12 +259,13 @@ struct Mosaic
     {
     }
 
+    GAPI_WRAP
     Mosaic() : cellSz(0), decim(0) {}
 
     /*@{*/
-    cv::Rect   mos;    //!< Coordinates of the mosaic
-    int        cellSz; //!< Cell size (same for X, Y)
-    int        decim;  //!< Decimation (0 stands for no decimation)
+    GAPI_PROP_RW cv::Rect mos;    //!< Coordinates of the mosaic
+    GAPI_PROP_RW int      cellSz; //!< Cell size (same for X, Y)
+    GAPI_PROP_RW int      decim;  //!< Decimation (0 stands for no decimation)
     /*@{*/
 };
 
@@ -266,7 +274,7 @@ struct Mosaic
  *
  * Image is blended on a frame using the specified mask.
  */
-struct Image
+struct GAPI_EXPORTS_W_SIMPLE Image
 {
     /**
      * @brief Mosaic constructor
@@ -275,6 +283,7 @@ struct Image
      * @param  img_   Image to draw
      * @param  alpha_ Alpha channel for image to draw (same size and number of channels)
     */
+    GAPI_WRAP
     Image(const cv::Point& org_,
           const cv::Mat& img_,
           const cv::Mat& alpha_) :
@@ -282,19 +291,20 @@ struct Image
     {
     }
 
+    GAPI_WRAP
     Image() = default;
 
     /*@{*/
-    cv::Point org;   //!< The bottom-left corner of the image
-    cv::Mat   img;   //!< Image to draw
-    cv::Mat   alpha; //!< Alpha channel for image to draw (same size and number of channels)
+    GAPI_PROP_RW cv::Point org;   //!< The bottom-left corner of the image
+    GAPI_PROP_RW cv::Mat   img;   //!< Image to draw
+    GAPI_PROP_RW cv::Mat   alpha; //!< Alpha channel for image to draw (same size and number of channels)
     /*@{*/
 };
 
 /**
  * @brief This structure represents a polygon to draw.
  */
-struct Poly
+struct GAPI_EXPORTS_W_SIMPLE Poly
 {
     /**
      * @brief Mosaic constructor
@@ -305,6 +315,7 @@ struct Poly
      * @param lt_     The Type of the line. See #LineTypes
      * @param shift_  The number of fractional bits in the point coordinate
     */
+    GAPI_WRAP
     Poly(const std::vector<cv::Point>& points_,
          const cv::Scalar& color_,
          int thick_ = 1,
@@ -314,14 +325,15 @@ struct Poly
     {
     }
 
+    GAPI_WRAP
     Poly() = default;
 
     /*@{*/
-    std::vector<cv::Point> points;  //!< Points to connect
-    cv::Scalar             color;   //!< The line color
-    int                    thick;   //!< The thickness of line
-    int                    lt;      //!< The Type of the line. See #LineTypes
-    int                    shift;   //!< The number of fractional bits in the point coordinate
+    GAPI_PROP_RW std::vector<cv::Point> points;  //!< Points to connect
+    GAPI_PROP_RW cv::Scalar             color;   //!< The line color
+    GAPI_PROP_RW int                    thick;   //!< The thickness of line
+    GAPI_PROP_RW int                    lt;      //!< The Type of the line. See #LineTypes
+    GAPI_PROP_RW int                    shift;   //!< The number of fractional bits in the point coordinate
     /*@{*/
 };
 
@@ -336,7 +348,7 @@ using Prim  = util::variant
     , Poly
     >;
 
-using Prims     = std::vector<Prim>;
+using Prims = std::vector<Prim>;
 //! @} gapi_draw_prims
 
 } // namespace draw
diff --git a/modules/gapi/include/opencv2/gapi/rmat.hpp b/modules/gapi/include/opencv2/gapi/rmat.hpp
index cc27f48664cd..6b289001e7f3 100644
--- a/modules/gapi/include/opencv2/gapi/rmat.hpp
+++ b/modules/gapi/include/opencv2/gapi/rmat.hpp
@@ -14,8 +14,8 @@
 namespace cv {
 namespace gapi {
 namespace s11n {
-    struct IOStream;
-    struct IIStream;
+struct IOStream;
+struct IIStream;
 } // namespace s11n
 } // namespace gapi
 } // namespace cv
@@ -111,10 +111,12 @@ class GAPI_EXPORTS RMat
         // is transferred to the device when the view is destroyed
         virtual View access(Access) = 0;
         virtual void serialize(cv::gapi::s11n::IOStream&) {
-            GAPI_Assert(false && "Generic serialize method should never be called for RMat adapter");
+            GAPI_Assert(false && "Generic serialize method of RMat::Adapter does nothing by default. "
+                                 "Please, implement it in derived class to properly serialize the object.");
         }
         virtual void deserialize(cv::gapi::s11n::IIStream&) {
-            GAPI_Assert(false && "Generic deserialize method should never be called for RMat adapter");
+            GAPI_Assert(false && "Generic deserialize method of RMat::Adapter does nothing by default. "
+                                 "Please, implement it in derived class to properly deserialize the object.");
         }
     };
     using AdapterP = std::shared_ptr<Adapter>;
diff --git a/modules/gapi/include/opencv2/gapi/s11n.hpp b/modules/gapi/include/opencv2/gapi/s11n.hpp
index 5a64410e5abe..53800970d1cb 100644
--- a/modules/gapi/include/opencv2/gapi/s11n.hpp
+++ b/modules/gapi/include/opencv2/gapi/s11n.hpp
@@ -13,69 +13,145 @@
 #include <opencv2/gapi/s11n/base.hpp>
 #include <opencv2/gapi/gcomputation.hpp>
 #include <opencv2/gapi/rmat.hpp>
+#include <opencv2/gapi/media.hpp>
+#include <opencv2/gapi/util/util.hpp>
+
+// FIXME: caused by deserialize_runarg
+#if (defined _WIN32 || defined _WIN64) && defined _MSC_VER
+#pragma warning(disable: 4702)
+#endif
 
 namespace cv {
 namespace gapi {
 
+/**
+* \addtogroup gapi_serialization
+* @{
+*/
+
 namespace detail {
-    GAPI_EXPORTS cv::GComputation getGraph(const std::vector<char> &p);
+    GAPI_EXPORTS cv::GComputation getGraph(const std::vector<char> &bytes);
 
-    GAPI_EXPORTS cv::GMetaArgs getMetaArgs(const std::vector<char> &p);
+    GAPI_EXPORTS cv::GMetaArgs getMetaArgs(const std::vector<char> &bytes);
 
-    GAPI_EXPORTS cv::GRunArgs getRunArgs(const std::vector<char> &p);
+    GAPI_EXPORTS cv::GRunArgs getRunArgs(const std::vector<char> &bytes);
 
-    GAPI_EXPORTS std::vector<std::string> getVectorOfStrings(const std::vector<char> &p);
+    GAPI_EXPORTS std::vector<std::string> getVectorOfStrings(const std::vector<char> &bytes);
 
     template<typename... Types>
-    cv::GCompileArgs getCompileArgs(const std::vector<char> &p);
+    cv::GCompileArgs getCompileArgs(const std::vector<char> &bytes);
 
-    template<typename RMatAdapterType>
-    cv::GRunArgs getRunArgsWithRMats(const std::vector<char> &p);
+    template<typename... AdapterType>
+    cv::GRunArgs getRunArgsWithAdapters(const std::vector<char> &bytes);
 } // namespace detail
 
+/** @brief Serialize a graph represented by GComputation into an array of bytes.
+ *
+ * Check different overloads for more examples.
+ * @param c GComputation to serialize.
+ * @return serialized vector of bytes.
+ */
 GAPI_EXPORTS std::vector<char> serialize(const cv::GComputation &c);
-//namespace{
 
+/** @overload
+ * @param ca GCompileArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GCompileArgs& ca);
+
+/** @overload
+ * @param ma GMetaArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GMetaArgs& ma);
+
+/** @overload
+ * @param ra GRunArgs to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const cv::GRunArgs& ra);
+
+/** @overload
+ * @param vs std::vector<std::string> to serialize.
+ */
+GAPI_EXPORTS std::vector<char> serialize(const std::vector<std::string>& vs);
+
+/**
+ * @private
+ */
 template<typename T> static inline
-T deserialize(const std::vector<char> &p);
-
-//} //ananymous namespace
-
-GAPI_EXPORTS std::vector<char> serialize(const cv::GCompileArgs&);
-GAPI_EXPORTS std::vector<char> serialize(const cv::GMetaArgs&);
-GAPI_EXPORTS std::vector<char> serialize(const cv::GRunArgs&);
-GAPI_EXPORTS std::vector<char> serialize(const std::vector<std::string>&);
-
+T deserialize(const std::vector<char> &bytes);
+
+/** @brief Deserialize GComputation from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GComputation object.
+ */
 template<> inline
-cv::GComputation deserialize(const std::vector<char> &p) {
-    return detail::getGraph(p);
+cv::GComputation deserialize(const std::vector<char> &bytes) {
+    return detail::getGraph(bytes);
 }
 
+/** @brief Deserialize GMetaArgs from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GMetaArgs object.
+ */
 template<> inline
-cv::GMetaArgs deserialize(const std::vector<char> &p) {
-    return detail::getMetaArgs(p);
+cv::GMetaArgs deserialize(const std::vector<char> &bytes) {
+    return detail::getMetaArgs(bytes);
 }
 
+/** @brief Deserialize GRunArgs from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized GRunArgs object.
+ */
 template<> inline
-cv::GRunArgs deserialize(const std::vector<char> &p) {
-    return detail::getRunArgs(p);
+cv::GRunArgs deserialize(const std::vector<char> &bytes) {
+    return detail::getRunArgs(bytes);
 }
 
+/** @brief Deserialize std::vector<std::string> from a byte array.
+ *
+ * Check different overloads for more examples.
+ * @param bytes serialized vector of bytes.
+ * @return deserialized std::vector<std::string> object.
+ */
 template<> inline
-std::vector<std::string> deserialize(const std::vector<char> &p) {
-    return detail::getVectorOfStrings(p);
+std::vector<std::string> deserialize(const std::vector<char> &bytes) {
+    return detail::getVectorOfStrings(bytes);
 }
 
+/**
+ * @brief Deserialize GCompileArgs which types were specified in the template from a byte array.
+ *
+ * @note cv::gapi::s11n::detail::S11N template specialization must be provided to make a custom type
+ * in GCompileArgs deserializable.
+ *
+ * @param bytes vector of bytes to deserialize GCompileArgs object from.
+ * @return GCompileArgs object.
+ * @see GCompileArgs cv::gapi::s11n::detail::S11N
+ */
 template<typename T, typename... Types> inline
 typename std::enable_if<std::is_same<T, GCompileArgs>::value, GCompileArgs>::
-type deserialize(const std::vector<char> &p) {
-    return detail::getCompileArgs<Types...>(p);
+type deserialize(const std::vector<char> &bytes) {
+    return detail::getCompileArgs<Types...>(bytes);
 }
 
-template<typename T, typename RMatAdapterType> inline
+/**
+ * @brief Deserialize GRunArgs including RMat and MediaFrame objects if any from a byte array.
+ *
+ * Adapter types are specified in the template.
+ * @note To be used properly specified adapter types must overload their deserialize() method.
+ * @param bytes vector of bytes to deserialize GRunArgs object from.
+ * @return GRunArgs including RMat and MediaFrame objects if any.
+ * @see RMat MediaFrame
+ */
+template<typename T, typename AtLeastOneAdapterT, typename... AdapterTypes> inline
 typename std::enable_if<std::is_same<T, GRunArgs>::value, GRunArgs>::
-type deserialize(const std::vector<char> &p) {
-    return detail::getRunArgsWithRMats<RMatAdapterType>(p);
+type deserialize(const std::vector<char> &bytes) {
+    return detail::getRunArgsWithAdapters<AtLeastOneAdapterT, AdapterTypes...>(bytes);
 }
 } // namespace gapi
 } // namespace cv
@@ -83,6 +159,17 @@ type deserialize(const std::vector<char> &p) {
 namespace cv {
 namespace gapi {
 namespace s11n {
+
+/** @brief This structure is an interface for serialization routines.
+ *
+ * It's main purpose is to provide multiple overloads for operator<<()
+ * with basic C++ in addition to OpenCV/G-API types.
+ *
+ * This sctructure can be inherited and further extended with additional types.
+ *
+ * For example, it is utilized in cv::gapi::s11n::detail::S11N as input parameter
+ * in serialize() method.
+ */
 struct GAPI_EXPORTS IOStream {
     virtual ~IOStream() = default;
     // Define the native support for basic C++ types at the API level:
@@ -99,6 +186,16 @@ struct GAPI_EXPORTS IOStream {
     virtual IOStream& operator<< (const std::string&) = 0;
 };
 
+/** @brief This structure is an interface for deserialization routines.
+ *
+ * It's main purpose is to provide multiple overloads for operator>>()
+ * with basic C++ in addition to OpenCV/G-API types.
+ *
+ * This structure can be inherited and further extended with additional types.
+ *
+ * For example, it is utilized in cv::gapi::s11n::detail::S11N as input parameter
+ * in deserialize() method.
+ */
 struct GAPI_EXPORTS IIStream {
     virtual ~IIStream() = default;
     virtual IIStream& operator>> (bool &) = 0;
@@ -116,7 +213,7 @@ struct GAPI_EXPORTS IIStream {
 };
 
 namespace detail {
-GAPI_EXPORTS std::unique_ptr<IIStream> getInStream(const std::vector<char> &p);
+GAPI_EXPORTS std::unique_ptr<IIStream> getInStream(const std::vector<char> &bytes);
 } // namespace detail
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -146,24 +243,26 @@ GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::Mat &m);
 
 // FIXME: for GRunArgs serailization
 #if !defined(GAPI_STANDALONE)
-GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::UMat &);
-GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::UMat &);
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::UMat & um);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::UMat & um);
 #endif // !defined(GAPI_STANDALONE)
 
 GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::RMat &r);
 GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::RMat &r);
 
-GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gapi::wip::IStreamSource::Ptr &);
-GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::gapi::wip::IStreamSource::Ptr &);
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gapi::wip::IStreamSource::Ptr &issptr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::gapi::wip::IStreamSource::Ptr &issptr);
 
-GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::VectorRef &);
-GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::VectorRef &);
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::VectorRef &vr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::VectorRef &vr);
 
-GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::OpaqueRef &);
-GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::OpaqueRef &);
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::detail::OpaqueRef &opr);
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::detail::OpaqueRef &opr);
 
-GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::MediaFrame &);
-GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::MediaFrame &);
+/// @private -- Exclude this function from OpenCV documentation
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::MediaFrame &mf);
+/// @private -- Exclude this function from OpenCV documentation
+GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::MediaFrame &mf);
 
 // Generic STL types ////////////////////////////////////////////////////////////////
 template<typename K, typename V>
@@ -186,6 +285,7 @@ IIStream& operator>> (IIStream& is, std::map<K, V> &m) {
     }
     return is;
 }
+
 template<typename K, typename V>
 IOStream& operator<< (IOStream& os, const std::unordered_map<K, V> &m) {
     const uint32_t sz = static_cast<uint32_t>(m.size());
@@ -206,6 +306,7 @@ IIStream& operator>> (IIStream& is, std::unordered_map<K, V> &m) {
     }
     return is;
 }
+
 template<typename T>
 IOStream& operator<< (IOStream& os, const std::vector<T> &ts) {
     const uint32_t sz = static_cast<uint32_t>(ts.size());
@@ -233,16 +334,19 @@ template<typename V>
 IOStream& put_v(IOStream&, const V&, std::size_t) {
     GAPI_Assert(false && "variant>>: requested index is invalid");
 };
+
 template<typename V, typename X, typename... Xs>
 IOStream& put_v(IOStream& os, const V& v, std::size_t x) {
     return (x == 0u)
         ? os << cv::util::get<X>(v)
         : put_v<V, Xs...>(os, v, x-1);
 }
+
 template<typename V>
 IIStream& get_v(IIStream&, V&, std::size_t, std::size_t) {
     GAPI_Assert(false && "variant<<: requested index is invalid");
 }
+
 template<typename V, typename X, typename... Xs>
 IIStream& get_v(IIStream& is, V& v, std::size_t i, std::size_t gi) {
     if (i == gi) {
@@ -254,11 +358,13 @@ IIStream& get_v(IIStream& is, V& v, std::size_t i, std::size_t gi) {
 }
 } // namespace detail
 
+//! @overload
 template<typename... Ts>
 IOStream& operator<< (IOStream& os, const cv::util::variant<Ts...> &v) {
     os << static_cast<uint32_t>(v.index());
     return detail::put_v<cv::util::variant<Ts...>, Ts...>(os, v, v.index());
 }
+//! @overload
 template<typename... Ts>
 IIStream& operator>> (IIStream& is, cv::util::variant<Ts...> &v) {
     int idx = -1;
@@ -268,6 +374,7 @@ IIStream& operator>> (IIStream& is, cv::util::variant<Ts...> &v) {
 }
 
 // FIXME: consider a better solution
+/// @private -- Exclude this function from OpenCV documentation
 template<typename... Ts>
 void getRunArgByIdx (IIStream& is, cv::util::variant<Ts...> &v, uint32_t idx) {
     is = detail::get_v<cv::util::variant<Ts...>, Ts...>(is, v, 0u, idx);
@@ -298,16 +405,39 @@ static cv::util::optional<GCompileArg> exec(const std::string& tag, cv::gapi::s1
 }
 };
 
-template<typename T> struct deserialize_runarg;
+template<typename ...T>
+struct deserialize_arg_with_adapter;
 
-template<typename RMatAdapterType>
+template<typename RA, typename TA>
+struct deserialize_arg_with_adapter<RA, TA> {
+static GRunArg exec(cv::gapi::s11n::IIStream& is) {
+    std::unique_ptr<TA> ptr(new TA);
+    ptr->deserialize(is);
+    return GRunArg { RA(std::move(ptr)) };
+}
+};
+
+template<typename RA>
+struct deserialize_arg_with_adapter<RA, void> {
+static GRunArg exec(cv::gapi::s11n::IIStream&) {
+    GAPI_Assert(false && "No suitable adapter class found during RMat/MediaFrame deserialization. "
+                         "Please, make sure you've passed them in cv::gapi::deserialize() template");
+    return GRunArg{};
+}
+};
+
+template<typename... Types>
 struct deserialize_runarg {
 static GRunArg exec(cv::gapi::s11n::IIStream& is, uint32_t idx) {
     if (idx == GRunArg::index_of<RMat>()) {
-        auto ptr = std::make_shared<RMatAdapterType>();
-        ptr->deserialize(is);
-        return GRunArg { RMat(std::move(ptr)) };
-    } else { // non-RMat arg - use default deserialization
+        // Type or void (if not found)
+        using TA = typename cv::util::find_adapter_impl<RMat::Adapter, Types...>::type;
+        return deserialize_arg_with_adapter<RMat, TA>::exec(is);
+    } else if (idx == GRunArg::index_of<MediaFrame>()) {
+        // Type or void (if not found)
+        using TA = typename cv::util::find_adapter_impl<MediaFrame::IAdapter, Types...>::type;
+        return deserialize_arg_with_adapter<MediaFrame, TA>::exec(is);
+    } else { // not an adapter holding type runarg - use default deserialization
         GRunArg arg;
         getRunArgByIdx(is, arg, idx);
         return arg;
@@ -350,9 +480,9 @@ cv::GCompileArgs getCompileArgs(const std::vector<char> &sArgs) {
     return args;
 }
 
-template<typename RMatAdapterType>
-cv::GRunArgs getRunArgsWithRMats(const std::vector<char> &p) {
-    std::unique_ptr<cv::gapi::s11n::IIStream> pIs = cv::gapi::s11n::detail::getInStream(p);
+template<typename... AdapterTypes>
+cv::GRunArgs getRunArgsWithAdapters(const std::vector<char> &bytes) {
+    std::unique_ptr<cv::gapi::s11n::IIStream> pIs = cv::gapi::s11n::detail::getInStream(bytes);
     cv::gapi::s11n::IIStream& is = *pIs;
     cv::GRunArgs args;
 
@@ -361,12 +491,14 @@ cv::GRunArgs getRunArgsWithRMats(const std::vector<char> &p) {
     for (uint32_t i = 0; i < sz; ++i) {
         uint32_t idx = 0;
         is >> idx;
-        args.push_back(cv::gapi::detail::deserialize_runarg<RMatAdapterType>::exec(is, idx));
+        args.push_back(cv::gapi::detail::deserialize_runarg<AdapterTypes...>::exec(is, idx));
     }
 
     return args;
 }
 } // namespace detail
+/** @} */
+
 } // namespace gapi
 } // namespace cv
 
diff --git a/modules/gapi/include/opencv2/gapi/s11n/base.hpp b/modules/gapi/include/opencv2/gapi/s11n/base.hpp
index b8ec8cfaff73..11440b27e5f8 100644
--- a/modules/gapi/include/opencv2/gapi/s11n/base.hpp
+++ b/modules/gapi/include/opencv2/gapi/s11n/base.hpp
@@ -2,7 +2,7 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 //
-// Copyright (C) 2020 Intel Corporation
+// Copyright (C) 2020-2021 Intel Corporation
 
 #ifndef OPENCV_GAPI_S11N_BASE_HPP
 #define OPENCV_GAPI_S11N_BASE_HPP
@@ -23,25 +23,54 @@ struct IIStream;
 
 namespace detail {
 
+//! @addtogroup gapi_serialization
+//! @{
+
 struct NotImplemented {
 };
 
-// The default S11N for custom types is NotImplemented
-// Don't! sublass from NotImplemented if you actually implement S11N.
+/** @brief This structure allows to implement serialization routines for custom types.
+ *
+ * The default S11N for custom types is not implemented.
+ *
+ * @note When providing an overloaded implementation for S11N with your type
+ * don't inherit it from NotImplemented structure.
+ *
+ * @note There are lots of overloaded >> and << operators for basic and OpenCV/G-API types
+ * which can be utilized when serializing a custom type.
+ *
+ * Example of usage:
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp S11N usage
+ *
+ */
 template<typename T>
 struct S11N: public NotImplemented {
+    /**
+     * @brief This function allows user to serialize their custom type.
+     *
+     * @note The default overload throws an exception if called. User need to
+     * properly overload the function to use it.
+     */
     static void serialize(IOStream &, const T &) {
         GAPI_Assert(false && "No serialization routine is provided!");
     }
+    /**
+     * @brief This function allows user to deserialize their custom type.
+     *
+     * @note The default overload throws an exception if called. User need to
+     * properly overload the function to use it.
+     */
     static T deserialize(IIStream &) {
         GAPI_Assert(false && "No deserialization routine is provided!");
     }
 };
 
+/// @private -- Exclude this struct from OpenCV documentation
 template<typename T> struct has_S11N_spec {
     static constexpr bool value = !std::is_base_of<NotImplemented,
                                         S11N<typename std::decay<T>::type>>::value;
 };
+//! @} gapi_serialization
 
 } // namespace detail
 } // namespace s11n
diff --git a/modules/gapi/include/opencv2/gapi/streaming/format.hpp b/modules/gapi/include/opencv2/gapi/streaming/format.hpp
index c9d2fa3e0a29..f7c3bd457dfb 100644
--- a/modules/gapi/include/opencv2/gapi/streaming/format.hpp
+++ b/modules/gapi/include/opencv2/gapi/streaming/format.hpp
@@ -74,7 +74,7 @@ e.g when graph's input needs to be passed directly to output, like in Streaming
 @param in Input image
 @return Copy of the input
 */
-GAPI_EXPORTS GMat copy(const GMat& in);
+GAPI_EXPORTS_W GMat copy(const GMat& in);
 
 /** @brief Makes a copy of the input frame. Note that this copy may be not real
 (no actual data copied). Use this function to maintain graph contracts,
diff --git a/modules/gapi/include/opencv2/gapi/streaming/onevpl/onevpl_source.hpp b/modules/gapi/include/opencv2/gapi/streaming/onevpl/onevpl_source.hpp
new file mode 100644
index 000000000000..fec8c73dffeb
--- /dev/null
+++ b/modules/gapi/include/opencv2/gapi/streaming/onevpl/onevpl_source.hpp
@@ -0,0 +1,44 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
+#define OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
+
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+#include <opencv2/gapi/streaming/source.hpp>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+class GAPI_EXPORTS OneVPLSource : public IStreamSource
+{
+public:
+    struct Priv;
+
+    explicit OneVPLSource(const std::string& filePath);
+    ~OneVPLSource() override;
+
+    bool pull(cv::gapi::wip::Data& data) override;
+    GMetaArg descr_of() const override;
+
+private:
+    explicit OneVPLSource(std::unique_ptr<Priv>&& impl);
+    std::unique_ptr<Priv> m_priv;
+};
+
+template<class... Args>
+GAPI_EXPORTS_W cv::Ptr<IStreamSource> inline make_vpl_src(const std::string& filePath, Args&&... args)
+{
+    return make_src<OneVPLSource>(filePath, std::forward<Args>(args)...);
+}
+
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_HPP
diff --git a/modules/gapi/include/opencv2/gapi/util/util.hpp b/modules/gapi/include/opencv2/gapi/util/util.hpp
index afcf5596fd60..eb435a3eeff0 100644
--- a/modules/gapi/include/opencv2/gapi/util/util.hpp
+++ b/modules/gapi/include/opencv2/gapi/util/util.hpp
@@ -117,6 +117,65 @@ namespace detail
         static type get(std::tuple<Objs...>&& objs) { return std::forward<std::tuple<Objs...>>(objs); }
     };
 } // namespace detail
+
+namespace util
+{
+template<typename ...L>
+struct overload_lamba_set;
+
+template<typename L1>
+struct overload_lamba_set<L1> : public L1
+{
+    overload_lamba_set(L1&& lambda) : L1(std::move(lambda)) {}
+    overload_lamba_set(const L1& lambda) : L1(lambda) {}
+
+    using L1::operator();
+};
+
+template<typename L1, typename ...L>
+struct overload_lamba_set<L1, L...> : public L1, public overload_lamba_set<L...>
+{
+    using base_type = overload_lamba_set<L...>;
+    overload_lamba_set(L1 &&lambda1, L&& ...lambdas):
+        L1(std::move(lambda1)),
+        base_type(std::forward<L>(lambdas)...) {}
+
+    overload_lamba_set(const L1 &lambda1, L&& ...lambdas):
+        L1(lambda1),
+        base_type(std::forward<L>(lambdas)...) {}
+
+    using L1::operator();
+    using base_type::operator();
+};
+
+template<typename... L>
+overload_lamba_set<L...> overload_lambdas(L&& ...lambdas)
+{
+    return overload_lamba_set<L...>(std::forward<L>(lambdas)...);
+}
+
+template<typename ...T>
+struct find_adapter_impl;
+
+template<typename AdapterT, typename T>
+struct find_adapter_impl<AdapterT, T>
+{
+    using type = typename std::conditional<std::is_base_of<AdapterT, T>::value,
+                                           T,
+                                           void>::type;
+    static constexpr bool found = std::is_base_of<AdapterT, T>::value;
+};
+
+template<typename AdapterT, typename T, typename... Types>
+struct find_adapter_impl<AdapterT, T, Types...>
+{
+    using type = typename std::conditional<std::is_base_of<AdapterT, T>::value,
+                                           T,
+                                           typename find_adapter_impl<AdapterT, Types...>::type>::type;
+    static constexpr bool found = std::is_base_of<AdapterT, T>::value ||
+                                  find_adapter_impl<AdapterT, Types...>::found;
+};
+} // namespace util
 } // namespace cv
 
 // \endcond
diff --git a/modules/gapi/include/opencv2/gapi/util/variant.hpp b/modules/gapi/include/opencv2/gapi/util/variant.hpp
index 71a06d2dcf22..f412110deb76 100644
--- a/modules/gapi/include/opencv2/gapi/util/variant.hpp
+++ b/modules/gapi/include/opencv2/gapi/util/variant.hpp
@@ -11,6 +11,7 @@
 #include <array>
 #include <type_traits>
 
+#include <opencv2/gapi/util/compiler_hints.hpp>
 #include <opencv2/gapi/util/throw.hpp>
 #include <opencv2/gapi/util/util.hpp> // max_of_t
 #include <opencv2/gapi/util/type_traits.hpp>
@@ -44,6 +45,12 @@ namespace util
         static const constexpr std::size_t value = detail::type_list_index_helper<0, Target, Types...>::value;
     };
 
+    template<std::size_t Index, class... Types >
+    struct type_list_element
+    {
+        using type = typename std::tuple_element<Index, std::tuple<Types...> >::type;
+    };
+
     class bad_variant_access: public std::exception
     {
     public:
@@ -233,9 +240,87 @@ namespace util
     template<typename T, typename... Types>
     const T& get(const util::variant<Types...> &v);
 
+    template<std::size_t Index, typename... Types>
+    typename util::type_list_element<Index, Types...>::type& get(util::variant<Types...> &v);
+
+    template<std::size_t Index, typename... Types>
+    const typename util::type_list_element<Index, Types...>::type& get(const util::variant<Types...> &v);
+
     template<typename T, typename... Types>
     bool holds_alternative(const util::variant<Types...> &v) noexcept;
 
+
+    // Visitor
+    namespace detail
+    {
+        struct visitor_interface {};
+
+        // Class `visitor_return_type_deduction_helper`
+        // introduces solution for deduction `return_type` in `visit` function in common way
+        // for both Lambda and class Visitor and keep one interface invocation point: `visit` only
+        // his helper class is required to unify return_type deduction mechanism because
+        // for Lambda it is possible to take type of `decltype(visitor(get<0>(var)))`
+        // but for class Visitor there is no operator() in base case,
+        // because it provides `operator() (std::size_t index, ...)`
+        // So `visitor_return_type_deduction_helper` expose `operator()`
+        // uses only for class Visitor only for deduction `return type` in visit()
+        template<typename R>
+        struct visitor_return_type_deduction_helper
+        {
+            using return_type = R;
+
+            // to be used in Lambda return type deduction context only
+            template<typename T>
+            return_type operator() (T&&);
+        };
+    }
+
+    // Special purpose `static_visitor` can receive additional arguments
+    template<typename R, typename Impl>
+    struct static_visitor : public detail::visitor_interface,
+                            public detail::visitor_return_type_deduction_helper<R> {
+
+        // assign responsibility for return type deduction to helper class
+        using return_type = typename detail::visitor_return_type_deduction_helper<R>::return_type;
+        using detail::visitor_return_type_deduction_helper<R>::operator();
+        friend Impl;
+
+        template<typename VariantValue, typename ...Args>
+        return_type operator() (std::size_t index, VariantValue&& value, Args&& ...args)
+        {
+            suppress_unused_warning(index);
+            return static_cast<Impl*>(this)-> visit(
+                                                std::forward<VariantValue>(value),
+                                                std::forward<Args>(args)...);
+        }
+    };
+
+    // Special purpose `static_indexed_visitor` can receive additional arguments
+    // And make forwarding current variant index as runtime function argument to its `Impl`
+    template<typename R, typename Impl>
+    struct static_indexed_visitor : public detail::visitor_interface,
+                                    public detail::visitor_return_type_deduction_helper<R> {
+
+        // assign responsibility for return type deduction to helper class
+        using return_type = typename detail::visitor_return_type_deduction_helper<R>::return_type;
+        using detail::visitor_return_type_deduction_helper<R>::operator();
+        friend Impl;
+
+        template<typename VariantValue, typename ...Args>
+        return_type operator() (std::size_t Index, VariantValue&& value, Args&& ...args)
+        {
+            return static_cast<Impl*>(this)-> visit(Index,
+                                                std::forward<VariantValue>(value),
+                                                std::forward<Args>(args)...);
+        }
+    };
+
+    template <class T>
+    struct variant_size;
+
+    template <class... Types>
+    struct variant_size<util::variant<Types...>>
+        : std::integral_constant<std::size_t, sizeof...(Types)> { };
     // FIXME: T&&, const TT&& versions.
 
     // Implementation //////////////////////////////////////////////////////////
@@ -402,6 +487,22 @@ namespace util
             throw_error(bad_variant_access());
     }
 
+    template<std::size_t Index, typename... Types>
+    typename util::type_list_element<Index, Types...>::type& get(util::variant<Types...> &v)
+    {
+        using ReturnType = typename util::type_list_element<Index, Types...>::type;
+        return const_cast<ReturnType&>(get<Index, Types...>(static_cast<const util::variant<Types...> &>(v)));
+    }
+
+    template<std::size_t Index, typename... Types>
+    const typename util::type_list_element<Index, Types...>::type& get(const util::variant<Types...> &v)
+    {
+        static_assert(Index < sizeof...(Types),
+                      "`Index` it out of bound of `util::variant` type list");
+        using ReturnType = typename util::type_list_element<Index, Types...>::type;
+        return get<ReturnType>(v);
+    }
+
     template<typename T, typename... Types>
     bool holds_alternative(const util::variant<Types...> &v) noexcept
     {
@@ -428,7 +529,130 @@ namespace util
     {
         return !(lhs == rhs);
     }
-} // namespace cv
+
+namespace detail
+{
+    // terminate recursion implementation for `non-void` ReturnType
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, typename... VisitorArgs>
+    ReturnType apply_visitor_impl(Visitor&&, Variant&,
+                                  std::true_type, std::false_type,
+                                  VisitorArgs&& ...)
+    {
+        return {};
+    }
+
+    // terminate recursion implementation for `void` ReturnType
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, typename... VisitorArgs>
+    void apply_visitor_impl(Visitor&&, Variant&,
+                            std::true_type, std::true_type,
+                            VisitorArgs&& ...)
+    {
+    }
+
+    // Intermediate resursion processor for Lambda Visitors
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, bool no_return_value, typename... VisitorArgs>
+    typename std::enable_if<!std::is_base_of<visitor_interface, typename std::decay<Visitor>::type>::value, ReturnType>::type
+         apply_visitor_impl(Visitor&& visitor, Variant&& v, std::false_type not_processed,
+                                               std::integral_constant<bool, no_return_value> should_no_return,
+                                               VisitorArgs&& ...args)
+    {
+        static_assert(std::is_same<ReturnType, decltype(visitor(get<CurIndex>(v)))>::value,
+                      "Different `ReturnType`s detected! All `Visitor::visit` or `overload_lamba_set`"
+                      " must return the same type");
+        suppress_unused_warning(not_processed);
+        if (v.index() == CurIndex)
+        {
+            return visitor.operator()(get<CurIndex>(v), std::forward<VisitorArgs>(args)... );
+        }
+
+        using is_variant_processed_t = std::integral_constant<bool, CurIndex + 1 >= ElemCount>;
+        return apply_visitor_impl<ReturnType, CurIndex +1, ElemCount>(
+                                  std::forward<Visitor>(visitor),
+                                  std::forward<Variant>(v),
+                                  is_variant_processed_t{},
+                                  should_no_return,
+                                  std::forward<VisitorArgs>(args)...);
+    }
+
+    //Visual Studio 2014 compilation fix: cast visitor to base class before invoke operator()
+    template<std::size_t CurIndex, typename ReturnType, typename Visitor, class Value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<static_visitor<ReturnType, typename std::decay<Visitor>::type>,
+                                            typename std::decay<Visitor>::type>::value, ReturnType>::type
+    invoke_class_visitor(Visitor& visitor, Value&& v,  VisitorArgs&&...args)
+    {
+        return static_cast<static_visitor<ReturnType, typename std::decay<Visitor>::type>&>(visitor).operator() (CurIndex, std::forward<Value>(v), std::forward<VisitorArgs>(args)... );
+    }
+
+    //Visual Studio 2014 compilation fix: cast visitor to base class before invoke operator()
+    template<std::size_t CurIndex, typename ReturnType, typename Visitor, class Value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<static_indexed_visitor<ReturnType, typename std::decay<Visitor>::type>,
+                                            typename std::decay<Visitor>::type>::value, ReturnType>::type
+    invoke_class_visitor(Visitor& visitor, Value&& v,  VisitorArgs&&...args)
+    {
+        return static_cast<static_indexed_visitor<ReturnType, typename std::decay<Visitor>::type>&>(visitor).operator() (CurIndex, std::forward<Value>(v), std::forward<VisitorArgs>(args)... );
+    }
+
+    // Intermediate recursion processor for special case `visitor_interface` derived Visitors
+    template<typename ReturnType, std::size_t CurIndex, std::size_t ElemCount,
+             typename Visitor, typename Variant, bool no_return_value, typename... VisitorArgs>
+    typename std::enable_if<std::is_base_of<visitor_interface, typename std::decay<Visitor>::type>::value, ReturnType>::type
+         apply_visitor_impl(Visitor&& visitor, Variant&& v, std::false_type not_processed,
+                                               std::integral_constant<bool, no_return_value> should_no_return,
+                                               VisitorArgs&& ...args)
+    {
+        static_assert(std::is_same<ReturnType, decltype(visitor(get<CurIndex>(v)))>::value,
+                      "Different `ReturnType`s detected! All `Visitor::visit` or `overload_lamba_set`"
+                      " must return the same type");
+        suppress_unused_warning(not_processed);
+        if (v.index() == CurIndex)
+        {
+            return invoke_class_visitor<CurIndex, ReturnType>(visitor, get<CurIndex>(v), std::forward<VisitorArgs>(args)... );
+        }
+
+        using is_variant_processed_t = std::integral_constant<bool, CurIndex + 1 >= ElemCount>;
+        return apply_visitor_impl<ReturnType, CurIndex +1, ElemCount>(
+                                  std::forward<Visitor>(visitor),
+                                  std::forward<Variant>(v),
+                                  is_variant_processed_t{},
+                                  should_no_return,
+                                  std::forward<VisitorArgs>(args)...);
+    }
+} // namespace detail
+
+    template<typename Visitor, typename Variant, typename... VisitorArg>
+    auto visit(Visitor &visitor, const Variant& var, VisitorArg &&...args) -> decltype(visitor(get<0>(var)))
+    {
+        constexpr std::size_t varsize = util::variant_size<Variant>::value;
+        static_assert(varsize != 0, "utils::variant must contains one type at least ");
+        using is_variant_processed_t = std::false_type;
+
+        using ReturnType = decltype(visitor(get<0>(var)));
+        using return_t = std::is_same<ReturnType, void>;
+        return detail::apply_visitor_impl<ReturnType, 0, varsize, Visitor>(
+                                    std::forward<Visitor>(visitor),
+                                    var, is_variant_processed_t{},
+                                    return_t{},
+                                    std::forward<VisitorArg>(args)...);
+    }
+
+    template<typename Visitor, typename Variant>
+    auto visit(Visitor&& visitor, const Variant& var) -> decltype(visitor(get<0>(var)))
+    {
+        constexpr std::size_t varsize = util::variant_size<Variant>::value;
+        static_assert(varsize != 0, "utils::variant must contains one type at least ");
+        using is_variant_processed_t = std::false_type;
+
+        using ReturnType = decltype(visitor(get<0>(var)));
+        using return_t = std::is_same<ReturnType, void>;
+        return detail::apply_visitor_impl<ReturnType, 0, varsize, Visitor>(
+                                    std::forward<Visitor>(visitor),
+                                    var, is_variant_processed_t{},
+                                    return_t{});
+    }
 } // namespace util
+} // namespace cv
 
 #endif // OPENCV_GAPI_UTIL_VARIANT_HPP
diff --git a/modules/gapi/misc/python/package/gapi/__init__.py b/modules/gapi/misc/python/package/gapi/__init__.py
index 733c980010af..dc874f0b0ca5 100644
--- a/modules/gapi/misc/python/package/gapi/__init__.py
+++ b/modules/gapi/misc/python/package/gapi/__init__.py
@@ -11,6 +11,36 @@ def parameterized(func):
     return parameterized
 
 
+@register('cv2.gapi')
+def networks(*args):
+    return cv.gapi_GNetPackage(list(map(cv.detail.strip, args)))
+
+
+@register('cv2.gapi')
+def compile_args(*args):
+    return list(map(cv.GCompileArg, args))
+
+
+@register('cv2')
+def GIn(*args):
+    return [*args]
+
+
+@register('cv2')
+def GOut(*args):
+    return [*args]
+
+
+@register('cv2')
+def gin(*args):
+    return [*args]
+
+
+@register('cv2.gapi')
+def descr_of(*args):
+    return [*args]
+
+
 @register('cv2')
 class GOpaque():
     # NB: Inheritance from c++ class cause segfault.
@@ -54,6 +84,10 @@ class Rect():
         def __new__(self):
             return cv.GOpaqueT(cv.gapi.CV_RECT)
 
+    class Prim():
+        def __new__(self):
+            return cv.GOpaqueT(cv.gapi.CV_DRAW_PRIM)
+
     class Any():
         def __new__(self):
             return cv.GOpaqueT(cv.gapi.CV_ANY)
@@ -113,6 +147,10 @@ class GMat():
         def __new__(self):
             return cv.GArrayT(cv.gapi.CV_GMAT)
 
+    class Prim():
+        def __new__(self):
+            return cv.GArray(cv.gapi.CV_DRAW_PRIM)
+
     class Any():
         def __new__(self):
             return cv.GArray(cv.gapi.CV_ANY)
@@ -134,6 +172,7 @@ def op(op_id, in_types, out_types):
             cv.GArray.Scalar:  cv.gapi.CV_SCALAR,
             cv.GArray.Mat:     cv.gapi.CV_MAT,
             cv.GArray.GMat:    cv.gapi.CV_GMAT,
+            cv.GArray.Prim:    cv.gapi.CV_DRAW_PRIM,
             cv.GArray.Any:     cv.gapi.CV_ANY
     }
 
@@ -149,22 +188,24 @@ def op(op_id, in_types, out_types):
             cv.GOpaque.Point2f: cv.gapi.CV_POINT2F,
             cv.GOpaque.Size:    cv.gapi.CV_SIZE,
             cv.GOpaque.Rect:    cv.gapi.CV_RECT,
+            cv.GOpaque.Prim:    cv.gapi.CV_DRAW_PRIM,
             cv.GOpaque.Any:     cv.gapi.CV_ANY
     }
 
     type2str = {
-        cv.gapi.CV_BOOL:    'cv.gapi.CV_BOOL' ,
-        cv.gapi.CV_INT:     'cv.gapi.CV_INT' ,
-        cv.gapi.CV_DOUBLE:  'cv.gapi.CV_DOUBLE' ,
-        cv.gapi.CV_FLOAT:   'cv.gapi.CV_FLOAT' ,
-        cv.gapi.CV_STRING:  'cv.gapi.CV_STRING' ,
-        cv.gapi.CV_POINT:   'cv.gapi.CV_POINT' ,
-        cv.gapi.CV_POINT2F: 'cv.gapi.CV_POINT2F' ,
-        cv.gapi.CV_SIZE:    'cv.gapi.CV_SIZE',
-        cv.gapi.CV_RECT:    'cv.gapi.CV_RECT',
-        cv.gapi.CV_SCALAR:  'cv.gapi.CV_SCALAR',
-        cv.gapi.CV_MAT:     'cv.gapi.CV_MAT',
-        cv.gapi.CV_GMAT:    'cv.gapi.CV_GMAT'
+        cv.gapi.CV_BOOL:      'cv.gapi.CV_BOOL' ,
+        cv.gapi.CV_INT:       'cv.gapi.CV_INT' ,
+        cv.gapi.CV_DOUBLE:    'cv.gapi.CV_DOUBLE' ,
+        cv.gapi.CV_FLOAT:     'cv.gapi.CV_FLOAT' ,
+        cv.gapi.CV_STRING:    'cv.gapi.CV_STRING' ,
+        cv.gapi.CV_POINT:     'cv.gapi.CV_POINT' ,
+        cv.gapi.CV_POINT2F:   'cv.gapi.CV_POINT2F' ,
+        cv.gapi.CV_SIZE:      'cv.gapi.CV_SIZE',
+        cv.gapi.CV_RECT:      'cv.gapi.CV_RECT',
+        cv.gapi.CV_SCALAR:    'cv.gapi.CV_SCALAR',
+        cv.gapi.CV_MAT:       'cv.gapi.CV_MAT',
+        cv.gapi.CV_GMAT:      'cv.gapi.CV_GMAT',
+        cv.gapi.CV_DRAW_PRIM: 'cv.gapi.CV_DRAW_PRIM'
     }
 
     # NB: Second lvl decorator takes class to decorate
@@ -244,3 +285,13 @@ def kernel_with_params(cls):
         return cls
 
     return kernel_with_params
+
+
+# FIXME: On the c++ side every class is placed in cv2 module.
+cv.gapi.wip.draw.Rect = cv.gapi_wip_draw_Rect
+cv.gapi.wip.draw.Text = cv.gapi_wip_draw_Text
+cv.gapi.wip.draw.Circle = cv.gapi_wip_draw_Circle
+cv.gapi.wip.draw.Line = cv.gapi_wip_draw_Line
+cv.gapi.wip.draw.Mosaic = cv.gapi_wip_draw_Mosaic
+cv.gapi.wip.draw.Image = cv.gapi_wip_draw_Image
+cv.gapi.wip.draw.Poly = cv.gapi_wip_draw_Poly
diff --git a/modules/gapi/misc/python/pyopencv_gapi.hpp b/modules/gapi/misc/python/pyopencv_gapi.hpp
index 6b782cfc8dd8..d378a91b5fd6 100644
--- a/modules/gapi/misc/python/pyopencv_gapi.hpp
+++ b/modules/gapi/misc/python/pyopencv_gapi.hpp
@@ -17,6 +17,7 @@ using gapi_ie_PyParams           = cv::gapi::ie::PyParams;
 using gapi_wip_IStreamSource_Ptr = cv::Ptr<cv::gapi::wip::IStreamSource>;
 using detail_ExtractArgsCallback = cv::detail::ExtractArgsCallback;
 using detail_ExtractMetaCallback = cv::detail::ExtractMetaCallback;
+using vector_GNetParam           = std::vector<cv::gapi::GNetParam>;
 
 // NB: Python wrapper generate T_U for T<U>
 // This behavior is only observed for inputs
@@ -42,6 +43,7 @@ using GArray_Rect    = cv::GArray<cv::Rect>;
 using GArray_Scalar  = cv::GArray<cv::Scalar>;
 using GArray_Mat     = cv::GArray<cv::Mat>;
 using GArray_GMat    = cv::GArray<cv::GMat>;
+using GArray_Prim    = cv::GArray<cv::gapi::wip::draw::Prim>;
 
 // FIXME: Python wrapper generate code without namespace std,
 // so it cause error: "string wasn't declared"
@@ -124,6 +126,66 @@ PyObject* pyopencv_from(const cv::detail::PyObjectHolder& v)
     return o;
 }
 
+// #FIXME: Is it possible to implement pyopencv_from/pyopencv_to for generic
+// cv::variant<Types...> ?
+template <>
+PyObject* pyopencv_from(const cv::gapi::wip::draw::Prim& prim)
+{
+    switch (prim.index())
+    {
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Rect>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Rect>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Text>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Text>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Circle>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Circle>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Line>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Line>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Poly>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Poly>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Mosaic>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Mosaic>(prim));
+        case cv::gapi::wip::draw::Prim::index_of<cv::gapi::wip::draw::Image>():
+            return pyopencv_from(cv::util::get<cv::gapi::wip::draw::Image>(prim));
+    }
+
+    util::throw_error(std::logic_error("Unsupported draw primitive type"));
+}
+
+template <>
+PyObject* pyopencv_from(const cv::gapi::wip::draw::Prims& value)
+{
+    return pyopencv_from_generic_vec(value);
+}
+
+template<>
+bool pyopencv_to(PyObject* obj, cv::gapi::wip::draw::Prim& value, const ArgInfo& info)
+{
+#define TRY_EXTRACT(Prim)                                                                                  \
+    if (PyObject_TypeCheck(obj, reinterpret_cast<PyTypeObject*>(pyopencv_gapi_wip_draw_##Prim##_TypePtr))) \
+    {                                                                                                      \
+        value = reinterpret_cast<pyopencv_gapi_wip_draw_##Prim##_t*>(obj)->v;                              \
+        return true;                                                                                       \
+    }                                                                                                      \
+
+    TRY_EXTRACT(Rect)
+    TRY_EXTRACT(Text)
+    TRY_EXTRACT(Circle)
+    TRY_EXTRACT(Line)
+    TRY_EXTRACT(Mosaic)
+    TRY_EXTRACT(Image)
+    TRY_EXTRACT(Poly)
+
+    failmsg("Unsupported primitive type");
+    return false;
+}
+
+template <>
+bool pyopencv_to(PyObject* obj, cv::gapi::wip::draw::Prims& value, const ArgInfo& info)
+{
+    return pyopencv_to_generic_vec(obj, value, info);
+}
+
 template<>
 PyObject* pyopencv_from(const cv::GArg& value)
 {
@@ -136,20 +198,21 @@ PyObject* pyopencv_from(const cv::GArg& value)
 #define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break
     switch (value.opaque_kind)
     {
-        HANDLE_CASE(BOOL,    bool);
-        HANDLE_CASE(INT,     int);
-        HANDLE_CASE(DOUBLE,  double);
-        HANDLE_CASE(FLOAT,   float);
-        HANDLE_CASE(STRING,  std::string);
-        HANDLE_CASE(POINT,   cv::Point);
-        HANDLE_CASE(POINT2F, cv::Point2f);
-        HANDLE_CASE(SIZE,    cv::Size);
-        HANDLE_CASE(RECT,    cv::Rect);
-        HANDLE_CASE(SCALAR,  cv::Scalar);
-        HANDLE_CASE(MAT,     cv::Mat);
-        HANDLE_CASE(UNKNOWN, cv::detail::PyObjectHolder);
+        HANDLE_CASE(BOOL,      bool);
+        HANDLE_CASE(INT,       int);
+        HANDLE_CASE(INT64,   int64_t);
+        HANDLE_CASE(DOUBLE,    double);
+        HANDLE_CASE(FLOAT,     float);
+        HANDLE_CASE(STRING,    std::string);
+        HANDLE_CASE(POINT,     cv::Point);
+        HANDLE_CASE(POINT2F,   cv::Point2f);
+        HANDLE_CASE(SIZE,      cv::Size);
+        HANDLE_CASE(RECT,      cv::Rect);
+        HANDLE_CASE(SCALAR,    cv::Scalar);
+        HANDLE_CASE(MAT,       cv::Mat);
+        HANDLE_CASE(UNKNOWN,   cv::detail::PyObjectHolder);
+        HANDLE_CASE(DRAW_PRIM, cv::gapi::wip::draw::Prim);
         UNSUPPORTED(UINT64);
-        UNSUPPORTED(DRAW_PRIM);
 #undef HANDLE_CASE
 #undef UNSUPPORTED
     }
@@ -164,23 +227,29 @@ bool pyopencv_to(PyObject* obj, cv::GArg& value, const ArgInfo& info)
 }
 
 template <>
-bool pyopencv_to(PyObject* obj, std::vector<GCompileArg>& value, const ArgInfo& info)
+bool pyopencv_to(PyObject* obj, std::vector<cv::gapi::GNetParam>& value, const ArgInfo& info)
 {
     return pyopencv_to_generic_vec(obj, value, info);
 }
 
 template <>
-PyObject* pyopencv_from(const std::vector<GCompileArg>& value)
+PyObject* pyopencv_from(const std::vector<cv::gapi::GNetParam>& value)
 {
     return pyopencv_from_generic_vec(value);
 }
 
 template <>
-bool pyopencv_to(PyObject* obj, GRunArgs& value, const ArgInfo& info)
+bool pyopencv_to(PyObject* obj, std::vector<GCompileArg>& value, const ArgInfo& info)
 {
     return pyopencv_to_generic_vec(obj, value, info);
 }
 
+template <>
+PyObject* pyopencv_from(const std::vector<GCompileArg>& value)
+{
+    return pyopencv_from_generic_vec(value);
+}
+
 template<>
 PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
 {
@@ -188,6 +257,7 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
     {
         case cv::detail::OpaqueKind::CV_BOOL      : return pyopencv_from(o.rref<bool>());
         case cv::detail::OpaqueKind::CV_INT       : return pyopencv_from(o.rref<int>());
+        case cv::detail::OpaqueKind::CV_INT64     : return pyopencv_from(o.rref<int64_t>());
         case cv::detail::OpaqueKind::CV_DOUBLE    : return pyopencv_from(o.rref<double>());
         case cv::detail::OpaqueKind::CV_FLOAT     : return pyopencv_from(o.rref<float>());
         case cv::detail::OpaqueKind::CV_STRING    : return pyopencv_from(o.rref<std::string>());
@@ -196,10 +266,10 @@ PyObject* pyopencv_from(const cv::detail::OpaqueRef& o)
         case cv::detail::OpaqueKind::CV_SIZE      : return pyopencv_from(o.rref<cv::Size>());
         case cv::detail::OpaqueKind::CV_RECT      : return pyopencv_from(o.rref<cv::Rect>());
         case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from(o.rref<cv::GArg>());
+        case cv::detail::OpaqueKind::CV_DRAW_PRIM : return pyopencv_from(o.rref<cv::gapi::wip::draw::Prim>());
         case cv::detail::OpaqueKind::CV_UINT64    : break;
         case cv::detail::OpaqueKind::CV_SCALAR    : break;
         case cv::detail::OpaqueKind::CV_MAT       : break;
-        case cv::detail::OpaqueKind::CV_DRAW_PRIM : break;
     }
 
     PyErr_SetString(PyExc_TypeError, "Unsupported GOpaque type");
@@ -213,6 +283,7 @@ PyObject* pyopencv_from(const cv::detail::VectorRef& v)
     {
         case cv::detail::OpaqueKind::CV_BOOL      : return pyopencv_from_generic_vec(v.rref<bool>());
         case cv::detail::OpaqueKind::CV_INT       : return pyopencv_from_generic_vec(v.rref<int>());
+        case cv::detail::OpaqueKind::CV_INT64     : return pyopencv_from_generic_vec(v.rref<int64_t>());
         case cv::detail::OpaqueKind::CV_DOUBLE    : return pyopencv_from_generic_vec(v.rref<double>());
         case cv::detail::OpaqueKind::CV_FLOAT     : return pyopencv_from_generic_vec(v.rref<float>());
         case cv::detail::OpaqueKind::CV_STRING    : return pyopencv_from_generic_vec(v.rref<std::string>());
@@ -223,8 +294,8 @@ PyObject* pyopencv_from(const cv::detail::VectorRef& v)
         case cv::detail::OpaqueKind::CV_SCALAR    : return pyopencv_from_generic_vec(v.rref<cv::Scalar>());
         case cv::detail::OpaqueKind::CV_MAT       : return pyopencv_from_generic_vec(v.rref<cv::Mat>());
         case cv::detail::OpaqueKind::CV_UNKNOWN   : return pyopencv_from_generic_vec(v.rref<cv::GArg>());
+        case cv::detail::OpaqueKind::CV_DRAW_PRIM : return pyopencv_from_generic_vec(v.rref<cv::gapi::wip::draw::Prim>());
         case cv::detail::OpaqueKind::CV_UINT64    : break;
-        case cv::detail::OpaqueKind::CV_DRAW_PRIM : break;
     }
 
     PyErr_SetString(PyExc_TypeError, "Unsupported GArray type");
@@ -249,52 +320,69 @@ PyObject* pyopencv_from(const GRunArg& v)
             return pyopencv_from(util::get<cv::detail::OpaqueRef>(v));
     }
 
-    PyErr_SetString(PyExc_TypeError, "Failed to unpack GRunArgs");
+    PyErr_SetString(PyExc_TypeError, "Failed to unpack GRunArgs. Index of variant is unknown");
     return NULL;
 }
 
-template<>
-PyObject* pyopencv_from(const GRunArgs& value)
+template <typename T>
+PyObject* pyopencv_from(const cv::optional<T>& opt)
 {
-    size_t i, n = value.size();
-
-    // NB: It doesn't make sense to return list with a single element
-    if (n == 1)
+    if (!opt.has_value())
     {
-        PyObject* item = pyopencv_from(value[0]);
-        if(!item)
-        {
-            return NULL;
-        }
-        return item;
+        Py_RETURN_NONE;
     }
+    return pyopencv_from(*opt);
+}
 
-    PyObject* list = PyList_New(n);
-    for(i = 0; i < n; ++i)
+template <>
+PyObject* pyopencv_from(const GOptRunArg& v)
+{
+    switch (v.index())
     {
-        PyObject* item = pyopencv_from(value[i]);
-        if(!item)
-        {
-            Py_DECREF(list);
-            PyErr_SetString(PyExc_TypeError, "Failed to unpack GRunArgs");
-            return NULL;
-        }
-        PyList_SetItem(list, i, item);
+        case GOptRunArg::index_of<cv::optional<cv::Mat>>():
+            return pyopencv_from(util::get<cv::optional<cv::Mat>>(v));
+
+        case GOptRunArg::index_of<cv::optional<cv::Scalar>>():
+            return pyopencv_from(util::get<cv::optional<cv::Scalar>>(v));
+
+        case GOptRunArg::index_of<optional<cv::detail::VectorRef>>():
+            return pyopencv_from(util::get<optional<cv::detail::VectorRef>>(v));
+
+        case GOptRunArg::index_of<optional<cv::detail::OpaqueRef>>():
+            return pyopencv_from(util::get<optional<cv::detail::OpaqueRef>>(v));
     }
 
-    return list;
+    PyErr_SetString(PyExc_TypeError, "Failed to unpack GOptRunArg. Index of variant is unknown");
+    return NULL;
 }
 
 template<>
-bool pyopencv_to(PyObject* obj, GMetaArgs& value, const ArgInfo& info)
+PyObject* pyopencv_from(const GRunArgs& value)
 {
-    return pyopencv_to_generic_vec(obj, value, info);
+     return value.size() == 1 ? pyopencv_from(value[0]) : pyopencv_from_generic_vec(value);
 }
 
 template<>
-PyObject* pyopencv_from(const GMetaArgs& value)
+PyObject* pyopencv_from(const GOptRunArgs& value)
 {
-    return pyopencv_from_generic_vec(value);
+    return value.size() == 1 ? pyopencv_from(value[0]) : pyopencv_from_generic_vec(value);
+}
+
+// FIXME: cv::variant should be wrapped once for all types.
+template <>
+PyObject* pyopencv_from(const cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>& v)
+{
+    using RunArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    switch (v.index())
+    {
+        case RunArgs::index_of<cv::GRunArgs>():
+            return pyopencv_from(util::get<cv::GRunArgs>(v));
+        case RunArgs::index_of<cv::GOptRunArgs>():
+            return pyopencv_from(util::get<cv::GOptRunArgs>(v));
+    }
+
+    PyErr_SetString(PyExc_TypeError, "Failed to recognize kind of RunArgs. Index of variant is unknown");
+    return NULL;
 }
 
 template <typename T>
@@ -318,16 +406,16 @@ void pyopencv_to_generic_vec_with_check(PyObject* from,
 }
 
 template <typename T>
-static PyObject* extract_proto_args(PyObject* py_args, PyObject* kw)
+static T extract_proto_args(PyObject* py_args)
 {
     using namespace cv;
 
     GProtoArgs args;
-    Py_ssize_t size = PyTuple_Size(py_args);
+    Py_ssize_t size = PyList_Size(py_args);
     args.reserve(size);
     for (int i = 0; i < size; ++i)
     {
-        PyObject* item = PyTuple_GetItem(py_args, i);
+        PyObject* item = PyList_GetItem(py_args, i);
         if (PyObject_TypeCheck(item, reinterpret_cast<PyTypeObject*>(pyopencv_GScalar_TypePtr)))
         {
             args.emplace_back(reinterpret_cast<pyopencv_GScalar_t*>(item)->v);
@@ -346,22 +434,11 @@ static PyObject* extract_proto_args(PyObject* py_args, PyObject* kw)
         }
         else
         {
-            PyErr_SetString(PyExc_TypeError, "Unsupported type for cv.GIn()/cv.GOut()");
-            return NULL;
+            util::throw_error(std::logic_error("Unsupported type for GProtoArgs"));
         }
     }
 
-    return pyopencv_from<T>(T{std::move(args)});
-}
-
-static PyObject* pyopencv_cv_GIn(PyObject* , PyObject* py_args, PyObject* kw)
-{
-    return extract_proto_args<GProtoInputArgs>(py_args, kw);
-}
-
-static PyObject* pyopencv_cv_GOut(PyObject* , PyObject* py_args, PyObject* kw)
-{
-    return extract_proto_args<GProtoOutputArgs>(py_args, kw);
+    return T(std::move(args));
 }
 
 static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::OpaqueKind kind)
@@ -386,6 +463,7 @@ static cv::detail::OpaqueRef extract_opaque_ref(PyObject* from, cv::detail::Opaq
         HANDLE_CASE(RECT,    cv::Rect);
         HANDLE_CASE(UNKNOWN, cv::GArg);
         UNSUPPORTED(UINT64);
+        UNSUPPORTED(INT64);
         UNSUPPORTED(SCALAR);
         UNSUPPORTED(MAT);
         UNSUPPORTED(DRAW_PRIM);
@@ -406,20 +484,21 @@ static cv::detail::VectorRef extract_vector_ref(PyObject* from, cv::detail::Opaq
 #define UNSUPPORTED(T) case cv::detail::OpaqueKind::CV_##T: break
     switch (kind)
     {
-        HANDLE_CASE(BOOL,    bool);
-        HANDLE_CASE(INT,     int);
-        HANDLE_CASE(DOUBLE,  double);
-        HANDLE_CASE(FLOAT,   float);
-        HANDLE_CASE(STRING,  std::string);
-        HANDLE_CASE(POINT,   cv::Point);
-        HANDLE_CASE(POINT2F, cv::Point2f);
-        HANDLE_CASE(SIZE,    cv::Size);
-        HANDLE_CASE(RECT,    cv::Rect);
-        HANDLE_CASE(SCALAR,  cv::Scalar);
-        HANDLE_CASE(MAT,     cv::Mat);
-        HANDLE_CASE(UNKNOWN, cv::GArg);
+        HANDLE_CASE(BOOL,      bool);
+        HANDLE_CASE(INT,       int);
+        HANDLE_CASE(DOUBLE,    double);
+        HANDLE_CASE(FLOAT,     float);
+        HANDLE_CASE(STRING,    std::string);
+        HANDLE_CASE(POINT,     cv::Point);
+        HANDLE_CASE(POINT2F,   cv::Point2f);
+        HANDLE_CASE(SIZE,      cv::Size);
+        HANDLE_CASE(RECT,      cv::Rect);
+        HANDLE_CASE(SCALAR,    cv::Scalar);
+        HANDLE_CASE(MAT,       cv::Mat);
+        HANDLE_CASE(UNKNOWN,   cv::GArg);
+        HANDLE_CASE(DRAW_PRIM, cv::gapi::wip::draw::Prim);
         UNSUPPORTED(UINT64);
-        UNSUPPORTED(DRAW_PRIM);
+        UNSUPPORTED(INT64);
 #undef HANDLE_CASE
 #undef UNSUPPORTED
     }
@@ -470,13 +549,15 @@ static cv::GRunArg extract_run_arg(const cv::GTypeInfo& info, PyObject* item)
 
 static cv::GRunArgs extract_run_args(const cv::GTypesInfo& info, PyObject* py_args)
 {
+    GAPI_Assert(PyList_Check(py_args));
+
     cv::GRunArgs args;
-    Py_ssize_t tuple_size = PyTuple_Size(py_args);
-    args.reserve(tuple_size);
+    Py_ssize_t list_size = PyList_Size(py_args);
+    args.reserve(list_size);
 
-    for (int i = 0; i < tuple_size; ++i)
+    for (int i = 0; i < list_size; ++i)
     {
-        args.push_back(extract_run_arg(info[i], PyTuple_GetItem(py_args, i)));
+        args.push_back(extract_run_arg(info[i], PyList_GetItem(py_args, i)));
     }
 
     return args;
@@ -517,13 +598,15 @@ static cv::GMetaArg extract_meta_arg(const cv::GTypeInfo& info, PyObject* item)
 
 static cv::GMetaArgs extract_meta_args(const cv::GTypesInfo& info, PyObject* py_args)
 {
+    GAPI_Assert(PyList_Check(py_args));
+
     cv::GMetaArgs metas;
-    Py_ssize_t tuple_size = PyTuple_Size(py_args);
-    metas.reserve(tuple_size);
+    Py_ssize_t list_size = PyList_Size(py_args);
+    metas.reserve(list_size);
 
-    for (int i = 0; i < tuple_size; ++i)
+    for (int i = 0; i < list_size; ++i)
     {
-        metas.push_back(extract_meta_arg(info[i], PyTuple_GetItem(py_args, i)));
+        metas.push_back(extract_meta_arg(info[i], PyList_GetItem(py_args, i)));
     }
 
     return metas;
@@ -581,7 +664,8 @@ static cv::GRunArgs run_py_kernel(cv::detail::PyObjectHolder kernel,
         cv::detail::PyObjectHolder result(
                 PyObject_CallObject(kernel.get(), args.get()), false);
 
-        if (PyErr_Occurred()) {
+        if (PyErr_Occurred())
+        {
             PyErr_PrintEx(0);
             PyErr_Clear();
             throw std::logic_error("Python kernel failed with error!");
@@ -589,8 +673,27 @@ static cv::GRunArgs run_py_kernel(cv::detail::PyObjectHolder kernel,
         // NB: In fact it's impossible situation, becase errors were handled above.
         GAPI_Assert(result.get() && "Python kernel returned NULL!");
 
-        outs = out_info.size() == 1 ? cv::GRunArgs{extract_run_arg(out_info[0], result.get())}
-                                    : extract_run_args(out_info, result.get());
+        if (out_info.size() == 1)
+        {
+            outs = cv::GRunArgs{extract_run_arg(out_info[0], result.get())};
+        }
+        else if (out_info.size() > 1)
+        {
+            GAPI_Assert(PyTuple_Check(result.get()));
+
+            Py_ssize_t tuple_size = PyTuple_Size(result.get());
+            outs.reserve(tuple_size);
+
+            for (int i = 0; i < tuple_size; ++i)
+            {
+                outs.push_back(extract_run_arg(out_info[i], PyTuple_GetItem(result.get(), i)));
+            }
+        }
+        else
+        {
+            // Seems to be impossible case.
+            GAPI_Assert(false);
+        }
     }
     catch (...)
     {
@@ -645,8 +748,9 @@ static cv::GMetaArgs get_meta_args(PyObject* tuple)
 }
 
 static GMetaArgs run_py_meta(cv::detail::PyObjectHolder out_meta,
-                            const cv::GMetaArgs         &meta,
-                            const cv::GArgs             &gargs) {
+                             const cv::GMetaArgs         &meta,
+                             const cv::GArgs             &gargs)
+{
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
 
@@ -688,7 +792,8 @@ static GMetaArgs run_py_meta(cv::detail::PyObjectHolder out_meta,
         cv::detail::PyObjectHolder result(
                 PyObject_CallObject(out_meta.get(), args.get()), false);
 
-        if (PyErr_Occurred()) {
+        if (PyErr_Occurred())
+        {
             PyErr_PrintEx(0);
             PyErr_Clear();
             throw std::logic_error("Python outMeta failed with error!");
@@ -720,21 +825,24 @@ static PyObject* pyopencv_cv_gapi_kernels(PyObject* , PyObject* py_args, PyObjec
         PyObject* user_kernel = PyTuple_GetItem(py_args, i);
 
         PyObject* id_obj = PyObject_GetAttrString(user_kernel, "id");
-        if (!id_obj) {
+        if (!id_obj)
+        {
             PyErr_SetString(PyExc_TypeError,
                     "Python kernel should contain id, please use cv.gapi.kernel to define kernel");
             return NULL;
         }
 
         PyObject* out_meta = PyObject_GetAttrString(user_kernel, "outMeta");
-        if (!out_meta) {
+        if (!out_meta)
+        {
             PyErr_SetString(PyExc_TypeError,
                     "Python kernel should contain outMeta, please use cv.gapi.kernel to define kernel");
             return NULL;
         }
 
         PyObject* run  = PyObject_GetAttrString(user_kernel, "run");
-        if (!run) {
+        if (!run)
+        {
             PyErr_SetString(PyExc_TypeError,
                     "Python kernel should contain run, please use cv.gapi.kernel to define kernel");
             return NULL;
@@ -756,23 +864,6 @@ static PyObject* pyopencv_cv_gapi_kernels(PyObject* , PyObject* py_args, PyObjec
     return pyopencv_from(pkg);
 }
 
-static PyObject* pyopencv_cv_gapi_networks(PyObject*, PyObject* py_args, PyObject*)
-{
-    using namespace cv;
-    gapi::GNetPackage pkg;
-    Py_ssize_t size = PyTuple_Size(py_args);
-    for (int i = 0; i < size; ++i)
-    {
-        gapi_ie_PyParams params;
-        PyObject* item = PyTuple_GetItem(py_args, i);
-        if (pyopencv_to(item, params, ArgInfo("PyParams", false)))
-        {
-            pkg += gapi::networks(params);
-        }
-    }
-    return pyopencv_from(pkg);
-}
-
 static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
 {
     using namespace cv;
@@ -834,53 +925,54 @@ static PyObject* pyopencv_cv_gapi_op(PyObject* , PyObject* py_args, PyObject*)
     return pyopencv_from(cv::gapi::wip::op(id, outMetaWrapper, std::move(args)));
 }
 
-static PyObject* pyopencv_cv_gin(PyObject*, PyObject* py_args, PyObject*)
+template<>
+bool pyopencv_to(PyObject* obj, cv::detail::ExtractArgsCallback& value, const ArgInfo&)
 {
-    cv::detail::PyObjectHolder holder{py_args};
-    auto callback = cv::detail::ExtractArgsCallback{[=](const cv::GTypesInfo& info)
-        {
-            PyGILState_STATE gstate;
-            gstate = PyGILState_Ensure();
+    cv::detail::PyObjectHolder holder{obj};
+    value = cv::detail::ExtractArgsCallback{[=](const cv::GTypesInfo& info)
+    {
+        PyGILState_STATE gstate;
+        gstate = PyGILState_Ensure();
 
-            cv::GRunArgs args;
-            try
-            {
-                args = extract_run_args(info, holder.get());
-            }
-            catch (...)
-            {
-                PyGILState_Release(gstate);
-                throw;
-            }
+        cv::GRunArgs args;
+        try
+        {
+            args = extract_run_args(info, holder.get());
+        }
+        catch (...)
+        {
             PyGILState_Release(gstate);
-            return args;
-        }};
-
-    return pyopencv_from(callback);
+            throw;
+        }
+        PyGILState_Release(gstate);
+        return args;
+    }};
+    return true;
 }
 
-static PyObject* pyopencv_cv_descr_of(PyObject*, PyObject* py_args, PyObject*)
+template<>
+bool pyopencv_to(PyObject* obj, cv::detail::ExtractMetaCallback& value, const ArgInfo&)
 {
-    Py_INCREF(py_args);
-    auto callback = cv::detail::ExtractMetaCallback{[=](const cv::GTypesInfo& info)
-        {
-            PyGILState_STATE gstate;
-            gstate = PyGILState_Ensure();
+    cv::detail::PyObjectHolder holder{obj};
+    value = cv::detail::ExtractMetaCallback{[=](const cv::GTypesInfo& info)
+    {
+        PyGILState_STATE gstate;
+        gstate = PyGILState_Ensure();
 
-            cv::GMetaArgs args;
-            try
-            {
-                args = extract_meta_args(info, py_args);
-            }
-            catch (...)
-            {
-                PyGILState_Release(gstate);
-                throw;
-            }
+        cv::GMetaArgs args;
+        try
+        {
+            args = extract_meta_args(info, holder.get());
+        }
+        catch (...)
+        {
             PyGILState_Release(gstate);
-            return args;
-        }};
-    return pyopencv_from(callback);
+            throw;
+        }
+        PyGILState_Release(gstate);
+        return args;
+    }};
+    return true;
 }
 
 template<typename T>
@@ -895,9 +987,12 @@ struct PyOpenCV_Converter<cv::GArray<T>>
         if (PyObject_TypeCheck(obj, reinterpret_cast<PyTypeObject*>(pyopencv_GArrayT_TypePtr)))
         {
             auto& array = reinterpret_cast<pyopencv_GArrayT_t*>(obj)->v;
-            try {
+            try
+            {
                 value = cv::util::get<cv::GArray<T>>(array.arg());
-            } catch (...) {
+            }
+            catch (...)
+            {
                 return false;
             }
             return true;
@@ -918,9 +1013,12 @@ struct PyOpenCV_Converter<cv::GOpaque<T>>
         if (PyObject_TypeCheck(obj, reinterpret_cast<PyTypeObject*>(pyopencv_GOpaqueT_TypePtr)))
         {
             auto& opaque = reinterpret_cast<pyopencv_GOpaqueT_t*>(obj)->v;
-            try {
+            try
+            {
                 value = cv::util::get<cv::GOpaque<T>>(opaque.arg());
-            } catch (...) {
+            }
+            catch (...)
+            {
                 return false;
             }
             return true;
@@ -929,11 +1027,39 @@ struct PyOpenCV_Converter<cv::GOpaque<T>>
     }
 };
 
+template<>
+bool pyopencv_to(PyObject* obj, cv::GProtoInputArgs& value, const ArgInfo& info)
+{
+    try
+    {
+        value = extract_proto_args<cv::GProtoInputArgs>(obj);
+        return true;
+    }
+    catch (...)
+    {
+        failmsg("Can't parse cv::GProtoInputArgs");
+        return false;
+    }
+}
+
+template<>
+bool pyopencv_to(PyObject* obj, cv::GProtoOutputArgs& value, const ArgInfo& info)
+{
+    try
+    {
+        value = extract_proto_args<cv::GProtoOutputArgs>(obj);
+        return true;
+    }
+    catch (...)
+    {
+        failmsg("Can't parse cv::GProtoOutputArgs");
+        return false;
+    }
+}
 
 // extend cv.gapi methods
 #define PYOPENCV_EXTRA_METHODS_GAPI \
   {"kernels", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_kernels), "kernels(...) -> GKernelPackage"}, \
-  {"networks", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_networks), "networks(...) -> GNetPackage"}, \
   {"__op", CV_PY_FN_WITH_KW(pyopencv_cv_gapi_op), "__op(...) -> retval\n"},
 
 
diff --git a/modules/gapi/misc/python/python_bridge.hpp b/modules/gapi/misc/python/python_bridge.hpp
index 0d1c6d51c574..11d17287308e 100644
--- a/modules/gapi/misc/python/python_bridge.hpp
+++ b/modules/gapi/misc/python/python_bridge.hpp
@@ -10,6 +10,7 @@
 #include <opencv2/gapi.hpp>
 #include <opencv2/gapi/garg.hpp>
 #include <opencv2/gapi/gopaque.hpp>
+#include <opencv2/gapi/render/render_types.hpp> // Prim
 
 #define ID(T, E)  T
 #define ID_(T, E) ID(T, E),
@@ -24,24 +25,29 @@
             GAPI_Assert(false && "Unsupported type"); \
     }
 
+using cv::gapi::wip::draw::Prim;
+
 #define GARRAY_TYPE_LIST_G(G, G2) \
-WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,    G)  \
-WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,     G)  \
-WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,  G)  \
-WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,   G)  \
-WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,  G)  \
-WRAP_ARGS(cv::Point   , cv::gapi::ArgType::CV_POINT,   G)  \
-WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F, G)  \
-WRAP_ARGS(cv::Size    , cv::gapi::ArgType::CV_SIZE,    G)  \
-WRAP_ARGS(cv::Rect    , cv::gapi::ArgType::CV_RECT,    G)  \
-WRAP_ARGS(cv::Scalar  , cv::gapi::ArgType::CV_SCALAR,  G)  \
-WRAP_ARGS(cv::Mat     , cv::gapi::ArgType::CV_MAT,     G)  \
-WRAP_ARGS(cv::GArg    , cv::gapi::ArgType::CV_ANY,     G)  \
-WRAP_ARGS(cv::GMat    , cv::gapi::ArgType::CV_GMAT,    G2) \
+WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,      G)  \
+WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,       G)  \
+WRAP_ARGS(int64_t     , cv::gapi::ArgType::CV_INT64,     G)  \
+WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,    G)  \
+WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,     G)  \
+WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,    G)  \
+WRAP_ARGS(cv::Point   , cv::gapi::ArgType::CV_POINT,     G)  \
+WRAP_ARGS(cv::Point2f , cv::gapi::ArgType::CV_POINT2F,   G)  \
+WRAP_ARGS(cv::Size    , cv::gapi::ArgType::CV_SIZE,      G)  \
+WRAP_ARGS(cv::Rect    , cv::gapi::ArgType::CV_RECT,      G)  \
+WRAP_ARGS(cv::Scalar  , cv::gapi::ArgType::CV_SCALAR,    G)  \
+WRAP_ARGS(cv::Mat     , cv::gapi::ArgType::CV_MAT,       G)  \
+WRAP_ARGS(Prim        , cv::gapi::ArgType::CV_DRAW_PRIM, G)  \
+WRAP_ARGS(cv::GArg    , cv::gapi::ArgType::CV_ANY,       G)  \
+WRAP_ARGS(cv::GMat    , cv::gapi::ArgType::CV_GMAT,      G2) \
 
 #define GOPAQUE_TYPE_LIST_G(G, G2) \
 WRAP_ARGS(bool        , cv::gapi::ArgType::CV_BOOL,    G)  \
 WRAP_ARGS(int         , cv::gapi::ArgType::CV_INT,     G)  \
+WRAP_ARGS(int64_t     , cv::gapi::ArgType::CV_INT64,   G)  \
 WRAP_ARGS(double      , cv::gapi::ArgType::CV_DOUBLE,  G)  \
 WRAP_ARGS(float       , cv::gapi::ArgType::CV_FLOAT,   G)  \
 WRAP_ARGS(std::string , cv::gapi::ArgType::CV_STRING,  G)  \
@@ -58,6 +64,7 @@ namespace gapi {
 enum ArgType {
     CV_BOOL,
     CV_INT,
+    CV_INT64,
     CV_DOUBLE,
     CV_FLOAT,
     CV_STRING,
@@ -68,6 +75,7 @@ enum ArgType {
     CV_SCALAR,
     CV_MAT,
     CV_GMAT,
+    CV_DRAW_PRIM,
     CV_ANY,
 };
 
diff --git a/modules/gapi/misc/python/samples/gaze_estimation.py b/modules/gapi/misc/python/samples/gaze_estimation.py
new file mode 100644
index 000000000000..5536787e608c
--- /dev/null
+++ b/modules/gapi/misc/python/samples/gaze_estimation.py
@@ -0,0 +1,458 @@
+import argparse
+import time
+import numpy as np
+import cv2 as cv
+
+
+# ------------------------Service operations------------------------
+def weight_path(model_path):
+    """ Get path of weights based on path to IR
+
+    Params:
+    model_path: the string contains path to IR file
+
+    Return:
+    Path to weights file
+    """
+    assert model_path.endswith('.xml'), "Wrong topology path was provided"
+    return model_path[:-3] + 'bin'
+
+
+def build_argparser():
+    """ Parse arguments from command line
+
+    Return:
+    Pack of arguments from command line
+    """
+    parser = argparse.ArgumentParser(description='This is an OpenCV-based version of Gaze Estimation example')
+
+    parser.add_argument('--input',
+                        help='Path to the input video file')
+    parser.add_argument('--out',
+                        help='Path to the output video file')
+    parser.add_argument('--facem',
+                        default='face-detection-retail-0005.xml',
+                        help='Path to OpenVINO face detection model (.xml)')
+    parser.add_argument('--faced',
+                        default='CPU',
+                        help='Target device for the face detection' +
+                        '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--headm',
+                        default='head-pose-estimation-adas-0001.xml',
+                        help='Path to OpenVINO head pose estimation model (.xml)')
+    parser.add_argument('--headd',
+                        default='CPU',
+                        help='Target device for the head pose estimation inference ' +
+                        '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--landm',
+                        default='facial-landmarks-35-adas-0002.xml',
+                        help='Path to OpenVINO landmarks detector model (.xml)')
+    parser.add_argument('--landd',
+                        default='CPU',
+                        help='Target device for the landmarks detector (e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--gazem',
+                        default='gaze-estimation-adas-0002.xml',
+                        help='Path to OpenVINO gaze vector estimaiton model (.xml)')
+    parser.add_argument('--gazed',
+                        default='CPU',
+                        help='Target device for the gaze vector estimation inference ' +
+                        '(e.g. CPU, GPU, VPU, ...)')
+    parser.add_argument('--eyem',
+                        default='open-closed-eye-0001.xml',
+                        help='Path to OpenVINO open closed eye model (.xml)')
+    parser.add_argument('--eyed',
+                        default='CPU',
+                        help='Target device for the eyes state inference (e.g. CPU, GPU, VPU, ...)')
+    return parser
+
+
+# ------------------------Support functions for custom kernels------------------------
+def intersection(surface, rect):
+    """ Remove zone of out of bound from ROI
+
+    Params:
+    surface: image bounds is rect representation (top left coordinates and width and height)
+    rect: region of interest is also has rect representation
+
+    Return:
+    Modified ROI with correct bounds
+    """
+    l_x = max(surface[0], rect[0])
+    l_y = max(surface[1], rect[1])
+    width = min(surface[0] + surface[2], rect[0] + rect[2]) - l_x
+    height = min(surface[1] + surface[3], rect[1] + rect[3]) - l_y
+    if width < 0 or height < 0:
+        return (0, 0, 0, 0)
+    return (l_x, l_y, width, height)
+
+
+def process_landmarks(r_x, r_y, r_w, r_h, landmarks):
+    """ Create points from result of inference of facial-landmarks network and size of input image
+
+    Params:
+    r_x: x coordinate of top left corner of input image
+    r_y: y coordinate of top left corner of input image
+    r_w: width of input image
+    r_h: height of input image
+    landmarks: result of inference of facial-landmarks network
+
+    Return:
+    Array of landmarks points for one face
+    """
+    lmrks = landmarks[0]
+    raw_x = lmrks[::2] * r_w + r_x
+    raw_y = lmrks[1::2] * r_h + r_y
+    return np.array([[int(x), int(y)] for x, y in zip(raw_x, raw_y)])
+
+
+def eye_box(p_1, p_2, scale=1.8):
+    """ Get bounding box of eye
+
+    Params:
+    p_1: point of left edge of eye
+    p_2: point of right edge of eye
+    scale: change size of box with this value
+
+    Return:
+    Bounding box of eye and its midpoint
+    """
+
+    size = np.linalg.norm(p_1 - p_2)
+    midpoint = (p_1 + p_2) / 2
+    width = scale * size
+    height = width
+    p_x = midpoint[0] - (width / 2)
+    p_y = midpoint[1] - (height / 2)
+    return (int(p_x), int(p_y), int(width), int(height)), list(map(int, midpoint))
+
+
+# ------------------------Custom graph operations------------------------
+@cv.gapi.op('custom.GProcessPoses',
+            in_types=[cv.GArray.GMat, cv.GArray.GMat, cv.GArray.GMat],
+            out_types=[cv.GArray.GMat])
+class GProcessPoses:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1, arr_desc2):
+        return cv.empty_array_desc()
+
+
+@cv.gapi.op('custom.GParseEyes',
+            in_types=[cv.GArray.GMat, cv.GArray.Rect, cv.GOpaque.Size],
+            out_types=[cv.GArray.Rect, cv.GArray.Rect, cv.GArray.Point, cv.GArray.Point])
+class GParseEyes:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1, arr_desc2):
+        return cv.empty_array_desc(), cv.empty_array_desc(), \
+               cv.empty_array_desc(), cv.empty_array_desc()
+
+
+@cv.gapi.op('custom.GGetStates',
+            in_types=[cv.GArray.GMat, cv.GArray.GMat],
+            out_types=[cv.GArray.Int, cv.GArray.Int])
+class GGetStates:
+    @staticmethod
+    def outMeta(arr_desc0, arr_desc1):
+        return cv.empty_array_desc(), cv.empty_array_desc()
+
+
+# ------------------------Custom kernels------------------------
+@cv.gapi.kernel(GProcessPoses)
+class GProcessPosesImpl:
+    """ Custom kernel. Processed poses of heads
+    """
+    @staticmethod
+    def run(in_ys, in_ps, in_rs):
+        """ Сustom kernel executable code
+
+        Params:
+        in_ys: yaw angle of head
+        in_ps: pitch angle of head
+        in_rs: roll angle of head
+
+        Return:
+        Arrays with heads poses
+        """
+        return [np.array([ys[0], ps[0], rs[0]]).T for ys, ps, rs in zip(in_ys, in_ps, in_rs)]
+
+
+@cv.gapi.kernel(GParseEyes)
+class GParseEyesImpl:
+    """ Custom kernel. Get information about eyes
+    """
+    @staticmethod
+    def run(in_landm_per_face, in_face_rcs, frame_size):
+        """ Сustom kernel executable code
+
+        Params:
+        in_landm_per_face: landmarks from inference of facial-landmarks network for each face
+        in_face_rcs: bounding boxes for each face
+        frame_size: size of input image
+
+        Return:
+        Arrays of ROI for left and right eyes, array of midpoints and
+        array of landmarks points
+        """
+        left_eyes = []
+        right_eyes = []
+        midpoints = []
+        lmarks = []
+        surface = (0, 0, *frame_size)
+        for landm_face, rect in zip(in_landm_per_face, in_face_rcs):
+            points = process_landmarks(*rect, landm_face)
+            lmarks.extend(points)
+
+            rect, midpoint_l = eye_box(points[0], points[1])
+            left_eyes.append(intersection(surface, rect))
+
+            rect, midpoint_r = eye_box(points[2], points[3])
+            right_eyes.append(intersection(surface, rect))
+
+            midpoints.append(midpoint_l)
+            midpoints.append(midpoint_r)
+        return left_eyes, right_eyes, midpoints, lmarks
+
+
+@cv.gapi.kernel(GGetStates)
+class GGetStatesImpl:
+    """ Custom kernel. Get state of eye - open or closed
+    """
+    @staticmethod
+    def run(eyesl, eyesr):
+        """ Сustom kernel executable code
+
+        Params:
+        eyesl: result of inference of open-closed-eye network for left eye
+        eyesr: result of inference of open-closed-eye network for right eye
+
+        Return:
+        States of left eyes and states of right eyes
+        """
+        out_l_st = [int(st) for eye_l in eyesl for st in (eye_l[:, 0] < eye_l[:, 1]).ravel()]
+        out_r_st = [int(st) for eye_r in eyesr for st in (eye_r[:, 0] < eye_r[:, 1]).ravel()]
+        return out_l_st, out_r_st
+
+
+if __name__ == '__main__':
+    ARGUMENTS = build_argparser().parse_args()
+
+    # ------------------------Demo's graph------------------------
+    g_in = cv.GMat()
+
+    # Detect faces
+    face_inputs = cv.GInferInputs()
+    face_inputs.setInput('data', g_in)
+    face_outputs = cv.gapi.infer('face-detection', face_inputs)
+    faces = face_outputs.at('detection_out')
+
+    # Parse faces
+    sz = cv.gapi.streaming.size(g_in)
+    faces_rc = cv.gapi.parseSSD(faces, sz, 0.5, False, False)
+
+    # Detect poses
+    head_inputs = cv.GInferInputs()
+    head_inputs.setInput('data', g_in)
+    face_outputs = cv.gapi.infer('head-pose', faces_rc, head_inputs)
+    angles_y = face_outputs.at('angle_y_fc')
+    angles_p = face_outputs.at('angle_p_fc')
+    angles_r = face_outputs.at('angle_r_fc')
+
+    # Parse poses
+    heads_pos = GProcessPoses.on(angles_y, angles_p, angles_r)
+
+    # Detect landmarks
+    landmark_inputs = cv.GInferInputs()
+    landmark_inputs.setInput('data', g_in)
+    landmark_outputs = cv.gapi.infer('facial-landmarks', faces_rc,
+                                     landmark_inputs)
+    landmark = landmark_outputs.at('align_fc3')
+
+    # Parse landmarks
+    left_eyes, right_eyes, mids, lmarks = GParseEyes.on(landmark, faces_rc, sz)
+
+    # Detect eyes
+    eyes_inputs = cv.GInferInputs()
+    eyes_inputs.setInput('input.1', g_in)
+    eyesl_outputs = cv.gapi.infer('open-closed-eye', left_eyes, eyes_inputs)
+    eyesr_outputs = cv.gapi.infer('open-closed-eye', right_eyes, eyes_inputs)
+    eyesl = eyesl_outputs.at('19')
+    eyesr = eyesr_outputs.at('19')
+
+    # Process eyes states
+    l_eye_st, r_eye_st = GGetStates.on(eyesl, eyesr)
+
+    # Gaze estimation
+    gaze_inputs = cv.GInferListInputs()
+    gaze_inputs.setInput('left_eye_image', left_eyes)
+    gaze_inputs.setInput('right_eye_image', right_eyes)
+    gaze_inputs.setInput('head_pose_angles', heads_pos)
+    gaze_outputs = cv.gapi.infer2('gaze-estimation', g_in, gaze_inputs)
+    gaze_vectors = gaze_outputs.at('gaze_vector')
+
+    out = cv.gapi.copy(g_in)
+    # ------------------------End of graph------------------------
+
+    comp = cv.GComputation(cv.GIn(g_in), cv.GOut(out,
+                                                 faces_rc,
+                                                 left_eyes,
+                                                 right_eyes,
+                                                 gaze_vectors,
+                                                 angles_y,
+                                                 angles_p,
+                                                 angles_r,
+                                                 l_eye_st,
+                                                 r_eye_st,
+                                                 mids,
+                                                 lmarks))
+
+    # Networks
+    face_net = cv.gapi.ie.params('face-detection', ARGUMENTS.facem,
+                                 weight_path(ARGUMENTS.facem), ARGUMENTS.faced)
+    head_pose_net = cv.gapi.ie.params('head-pose', ARGUMENTS.headm,
+                                      weight_path(ARGUMENTS.headm), ARGUMENTS.headd)
+    landmarks_net = cv.gapi.ie.params('facial-landmarks', ARGUMENTS.landm,
+                                      weight_path(ARGUMENTS.landm), ARGUMENTS.landd)
+    gaze_net = cv.gapi.ie.params('gaze-estimation', ARGUMENTS.gazem,
+                                 weight_path(ARGUMENTS.gazem), ARGUMENTS.gazed)
+    eye_net = cv.gapi.ie.params('open-closed-eye', ARGUMENTS.eyem,
+                                weight_path(ARGUMENTS.eyem), ARGUMENTS.eyed)
+
+    nets = cv.gapi.networks(face_net, head_pose_net, landmarks_net, gaze_net, eye_net)
+
+    # Kernels pack
+    kernels = cv.gapi.kernels(GParseEyesImpl, GProcessPosesImpl, GGetStatesImpl)
+
+    # ------------------------Execution part------------------------
+    ccomp = comp.compileStreaming(args=cv.gapi.compile_args(kernels, nets))
+    source = cv.gapi.wip.make_capture_src(ARGUMENTS.input)
+    ccomp.setSource(cv.gin(source))
+    ccomp.start()
+
+    frames = 0
+    fps = 0
+    print('Processing')
+    START_TIME = time.time()
+
+    while True:
+        start_time_cycle = time.time()
+        has_frame, (oimg,
+                    outr,
+                    l_eyes,
+                    r_eyes,
+                    outg,
+                    out_y,
+                    out_p,
+                    out_r,
+                    out_st_l,
+                    out_st_r,
+                    out_mids,
+                    outl) = ccomp.pull()
+
+        if not has_frame:
+            break
+
+        # Draw
+        GREEN = (0, 255, 0)
+        RED = (0, 0, 255)
+        WHITE = (255, 255, 255)
+        BLUE = (255, 0, 0)
+        PINK = (255, 0, 255)
+        YELLOW = (0, 255, 255)
+
+        M_PI_180 = np.pi / 180
+        M_PI_2 = np.pi / 2
+        M_PI = np.pi
+
+        FACES_SIZE = len(outr)
+
+        for i, out_rect in enumerate(outr):
+            # Face box
+            cv.rectangle(oimg, out_rect, WHITE, 1)
+            rx, ry, rwidth, rheight = out_rect
+
+            # Landmarks
+            lm_radius = int(0.01 * rwidth + 1)
+            lmsize = int(len(outl) / FACES_SIZE)
+            for j in range(lmsize):
+                cv.circle(oimg, outl[j + i * lmsize], lm_radius, YELLOW, -1)
+
+            # Headposes
+            yaw = out_y[i]
+            pitch = out_p[i]
+            roll = out_r[i]
+            sin_y = np.sin(yaw[:] * M_PI_180)
+            sin_p = np.sin(pitch[:] * M_PI_180)
+            sin_r = np.sin(roll[:] * M_PI_180)
+
+            cos_y = np.cos(yaw[:] * M_PI_180)
+            cos_p = np.cos(pitch[:] * M_PI_180)
+            cos_r = np.cos(roll[:] * M_PI_180)
+
+            axis_length = 0.4 * rwidth
+            x_center = int(rx + rwidth / 2)
+            y_center = int(ry + rheight / 2)
+
+            # center to right
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * (cos_r * cos_y + sin_y * sin_p * sin_r)),
+                     int(y_center + axis_length * cos_p * sin_r)],
+                    RED, 2)
+
+            # center to top
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * (cos_r * sin_y * sin_p + cos_y * sin_r)),
+                     int(y_center - axis_length * cos_p * cos_r)],
+                    GREEN, 2)
+
+            # center to forward
+            cv.line(oimg, [x_center, y_center],
+                    [int(x_center + axis_length * sin_y * cos_p),
+                     int(y_center + axis_length * sin_p)],
+                    PINK, 2)
+
+            scale_box = 0.002 * rwidth
+            cv.putText(oimg, "head pose: (y=%0.0f, p=%0.0f, r=%0.0f)" %
+                       (np.round(yaw), np.round(pitch), np.round(roll)),
+                       [int(rx), int(ry + rheight + 5 * rwidth / 100)],
+                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
+
+            # Eyes boxes
+            color_l = GREEN if out_st_l[i] else RED
+            cv.rectangle(oimg, l_eyes[i], color_l, 1)
+            color_r = GREEN if out_st_r[i] else RED
+            cv.rectangle(oimg, r_eyes[i], color_r, 1)
+
+            # Gaze vectors
+            norm_gazes = np.linalg.norm(outg[i][0])
+            gaze_vector = outg[i][0] / norm_gazes
+
+            arrow_length = 0.4 * rwidth
+            gaze_arrow = [arrow_length * gaze_vector[0], -arrow_length * gaze_vector[1]]
+            left_arrow = [int(a+b) for a, b in zip(out_mids[0 + i * 2], gaze_arrow)]
+            right_arrow = [int(a+b) for a, b in zip(out_mids[1 + i * 2], gaze_arrow)]
+            if out_st_l[i]:
+                cv.arrowedLine(oimg, out_mids[0 + i * 2], left_arrow, BLUE, 2)
+            if out_st_r[i]:
+                cv.arrowedLine(oimg, out_mids[1 + i * 2], right_arrow, BLUE, 2)
+
+            v0, v1, v2 = outg[i][0]
+
+            gaze_angles = [180 / M_PI * (M_PI_2 + np.arctan2(v2, v0)),
+                           180 / M_PI * (M_PI_2 - np.arccos(v1 / norm_gazes))]
+            cv.putText(oimg, "gaze angles: (h=%0.0f, v=%0.0f)" %
+                       (np.round(gaze_angles[0]), np.round(gaze_angles[1])),
+                       [int(rx), int(ry + rheight + 12 * rwidth / 100)],
+                       cv.FONT_HERSHEY_PLAIN, scale_box * 2, WHITE, 1)
+
+        # Add FPS value to frame
+        cv.putText(oimg, "FPS: %0i" % (fps), [int(20), int(40)],
+                   cv.FONT_HERSHEY_PLAIN, 2, RED, 2)
+
+        # Show result
+        cv.imshow('Gaze Estimation', oimg)
+        cv.waitKey(1)
+
+        fps = int(1. / (time.time() - start_time_cycle))
+        frames += 1
+    EXECUTION_TIME = time.time() - START_TIME
+    print('Execution successful')
+    print('Mean FPS is ', int(frames / EXECUTION_TIME))
diff --git a/modules/gapi/misc/python/shadow_gapi.hpp b/modules/gapi/misc/python/shadow_gapi.hpp
index 40dab4158141..0b489dde0f55 100644
--- a/modules/gapi/misc/python/shadow_gapi.hpp
+++ b/modules/gapi/misc/python/shadow_gapi.hpp
@@ -3,64 +3,80 @@
 
 namespace cv
 {
-   struct GAPI_EXPORTS_W_SIMPLE GCompileArg { };
-
-   GAPI_EXPORTS_W GCompileArgs compile_args(gapi::GKernelPackage pkg);
-   GAPI_EXPORTS_W GCompileArgs compile_args(gapi::GNetPackage pkg);
-   GAPI_EXPORTS_W GCompileArgs compile_args(gapi::GKernelPackage kernels, gapi::GNetPackage nets);
+struct GAPI_EXPORTS_W_SIMPLE GCompileArg
+{
+    GAPI_WRAP GCompileArg(gapi::GKernelPackage pkg);
+    GAPI_WRAP GCompileArg(gapi::GNetPackage pkg);
+};
 
-   // NB: This classes doesn't exist in *.so
-   // HACK: Mark them as a class to force python wrapper generate code for this entities
-   class GAPI_EXPORTS_W_SIMPLE GProtoArg { };
-   class GAPI_EXPORTS_W_SIMPLE GProtoInputArgs { };
-   class GAPI_EXPORTS_W_SIMPLE GProtoOutputArgs { };
-   class GAPI_EXPORTS_W_SIMPLE GRunArg { };
-   class GAPI_EXPORTS_W_SIMPLE GMetaArg { GAPI_WRAP GMetaArg(); };
+class GAPI_EXPORTS_W_SIMPLE GInferInputs
+{
+public:
+    GAPI_WRAP GInferInputs();
+    GAPI_WRAP GInferInputs& setInput(const std::string& name, const cv::GMat&   value);
+    GAPI_WRAP GInferInputs& setInput(const std::string& name, const cv::GFrame& value);
+};
 
-   using GProtoInputArgs  = GIOProtoArgs<In_Tag>;
-   using GProtoOutputArgs = GIOProtoArgs<Out_Tag>;
+class GAPI_EXPORTS_W_SIMPLE GInferListInputs
+{
+public:
+    GAPI_WRAP GInferListInputs();
+    GAPI_WRAP GInferListInputs setInput(const std::string& name, const cv::GArray<cv::GMat>& value);
+    GAPI_WRAP GInferListInputs setInput(const std::string& name, const cv::GArray<cv::Rect>& value);
+};
 
-   class GAPI_EXPORTS_W_SIMPLE GInferInputs
-   {
-   public:
-       GAPI_WRAP GInferInputs();
-       GAPI_WRAP void setInput(const std::string& name, const cv::GMat&   value);
-       GAPI_WRAP void setInput(const std::string& name, const cv::GFrame& value);
-   };
+class GAPI_EXPORTS_W_SIMPLE GInferOutputs
+{
+public:
+    GAPI_WRAP GInferOutputs();
+    GAPI_WRAP cv::GMat at(const std::string& name);
+};
 
-   class GAPI_EXPORTS_W_SIMPLE GInferListInputs
-   {
-   public:
-       GAPI_WRAP GInferListInputs();
-       GAPI_WRAP void setInput(const std::string& name, const cv::GArray<cv::GMat>& value);
-       GAPI_WRAP void setInput(const std::string& name, const cv::GArray<cv::Rect>& value);
-   };
+class GAPI_EXPORTS_W_SIMPLE GInferListOutputs
+{
+public:
+    GAPI_WRAP GInferListOutputs();
+    GAPI_WRAP cv::GArray<cv::GMat> at(const std::string& name);
+};
 
-   class GAPI_EXPORTS_W_SIMPLE GInferOutputs
-   {
-   public:
-       GAPI_WRAP GInferOutputs();
-       GAPI_WRAP cv::GMat at(const std::string& name);
-   };
+namespace gapi
+{
+namespace wip
+{
+class GAPI_EXPORTS_W IStreamSource { };
+namespace draw
+{
+    // NB: These render primitives are partially wrapped in shadow file
+    // because cv::Rect conflicts with cv::gapi::wip::draw::Rect in python generator
+    // and cv::Rect2i breaks standalone mode.
+    struct Rect
+    {
+        GAPI_WRAP Rect(const cv::Rect2i& rect_,
+                       const cv::Scalar& color_,
+                       int thick_ = 1,
+                       int lt_ = 8,
+                       int shift_ = 0);
+    };
 
-   class GAPI_EXPORTS_W_SIMPLE GInferListOutputs
-   {
-   public:
-       GAPI_WRAP GInferListOutputs();
-       GAPI_WRAP cv::GArray<cv::GMat> at(const std::string& name);
-   };
+    struct Mosaic
+    {
+        GAPI_WRAP Mosaic(const cv::Rect2i& mos_, int cellSz_, int decim_);
+    };
+} // namespace draw
+} // namespace wip
+namespace streaming
+{
+    // FIXME: Extend to work with an arbitrary G-type.
+    cv::GOpaque<int64_t> GAPI_EXPORTS_W timestamp(cv::GMat);
+    cv::GOpaque<int64_t> GAPI_EXPORTS_W seqNo(cv::GMat);
+    cv::GOpaque<int64_t> GAPI_EXPORTS_W seq_id(cv::GMat);
 
-   namespace detail
-   {
-       struct GAPI_EXPORTS_W_SIMPLE ExtractArgsCallback { };
-       struct GAPI_EXPORTS_W_SIMPLE ExtractMetaCallback { };
-   } // namespace detail
+    GAPI_EXPORTS_W cv::GMat desync(const cv::GMat &g);
+} // namespace streaming
+} // namespace gapi
 
-   namespace gapi
-   {
-       namespace wip
-       {
-           class GAPI_EXPORTS_W IStreamSource { };
-       } // namespace wip
-   } // namespace gapi
+namespace detail
+{
+    gapi::GNetParam GAPI_EXPORTS_W strip(gapi::ie::PyParams params);
+} // namespace detail
 } // namespace cv
diff --git a/modules/gapi/misc/python/test/test_gapi_core.py b/modules/gapi/misc/python/test/test_gapi_core.py
index 814d05d7cde4..780558d98b1a 100644
--- a/modules/gapi/misc/python/test/test_gapi_core.py
+++ b/modules/gapi/misc/python/test/test_gapi_core.py
@@ -3,187 +3,209 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
 
 from tests_common import NewOpenCVTests
 
 
-# Plaidml is an optional backend
-pkgs = [
-          ('ocl'    , cv.gapi.core.ocl.kernels()),
-          ('cpu'    , cv.gapi.core.cpu.kernels()),
-          ('fluid'  , cv.gapi.core.fluid.kernels())
-          # ('plaidml', cv.gapi.core.plaidml.kernels())
-       ]
+try:
 
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
-class gapi_core_test(NewOpenCVTests):
+    # Plaidml is an optional backend
+    pkgs = [
+              ('ocl'    , cv.gapi.core.ocl.kernels()),
+              ('cpu'    , cv.gapi.core.cpu.kernels()),
+              ('fluid'  , cv.gapi.core.fluid.kernels())
+              # ('plaidml', cv.gapi.core.plaidml.kernels())
+           ]
 
-    def test_add(self):
-        # TODO: Extend to use any type and size here
-        sz = (720, 1280)
-        in1 = np.full(sz, 100)
-        in2 = np.full(sz, 50)
 
-        # OpenCV
-        expected = cv.add(in1, in2)
+    class gapi_core_test(NewOpenCVTests):
 
-        # G-API
-        g_in1 = cv.GMat()
-        g_in2 = cv.GMat()
-        g_out = cv.gapi.add(g_in1, g_in2)
-        comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+        def test_add(self):
+            # TODO: Extend to use any type and size here
+            sz = (720, 1280)
+            in1 = np.full(sz, 100)
+            in2 = np.full(sz, 50)
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in1, in2), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
-            self.assertEqual(expected.dtype, actual.dtype, 'Failed on ' + pkg_name + ' backend')
+            # OpenCV
+            expected = cv.add(in1, in2)
 
+            # G-API
+            g_in1 = cv.GMat()
+            g_in2 = cv.GMat()
+            g_out = cv.gapi.add(g_in1, g_in2)
+            comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in1, in2), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
+                self.assertEqual(expected.dtype, actual.dtype, 'Failed on ' + pkg_name + ' backend')
+
+
+        def test_add_uint8(self):
+            sz = (720, 1280)
+            in1 = np.full(sz, 100, dtype=np.uint8)
+            in2 = np.full(sz, 50 , dtype=np.uint8)
+
+            # OpenCV
+            expected = cv.add(in1, in2)
+
+            # G-API
+            g_in1 = cv.GMat()
+            g_in2 = cv.GMat()
+            g_out = cv.gapi.add(g_in1, g_in2)
+            comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in1, in2), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
+                self.assertEqual(expected.dtype, actual.dtype, 'Failed on ' + pkg_name + ' backend')
 
-    def test_add_uint8(self):
-        sz = (720, 1280)
-        in1 = np.full(sz, 100, dtype=np.uint8)
-        in2 = np.full(sz, 50 , dtype=np.uint8)
 
-        # OpenCV
-        expected = cv.add(in1, in2)
+        def test_mean(self):
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_mat = cv.imread(img_path)
 
-        # G-API
-        g_in1 = cv.GMat()
-        g_in2 = cv.GMat()
-        g_out = cv.gapi.add(g_in1, g_in2)
-        comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
+            # OpenCV
+            expected = cv.mean(in_mat)
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in1, in2), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
-            self.assertEqual(expected.dtype, actual.dtype, 'Failed on ' + pkg_name + ' backend')
+            # G-API
+            g_in = cv.GMat()
+            g_out = cv.gapi.mean(g_in)
+            comp = cv.GComputation(g_in, g_out)
+
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
 
 
-    def test_mean(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.imread(img_path)
+        def test_split3(self):
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_mat = cv.imread(img_path)
 
-        # OpenCV
-        expected = cv.mean(in_mat)
+            # OpenCV
+            expected = cv.split(in_mat)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.mean(g_in)
-        comp = cv.GComputation(g_in, g_out)
+            # G-API
+            g_in = cv.GMat()
+            b, g, r = cv.gapi.split3(g_in)
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(b, g, r))
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                for e, a in zip(expected, actual):
+                    self.assertEqual(0.0, cv.norm(e, a, cv.NORM_INF),
+                                     'Failed on ' + pkg_name + ' backend')
+                    self.assertEqual(e.dtype, a.dtype, 'Failed on ' + pkg_name + ' backend')
 
 
-    def test_split3(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.imread(img_path)
+        def test_threshold(self):
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in_mat = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
+            maxv = (30, 30)
 
-        # OpenCV
-        expected = cv.split(in_mat)
+            # OpenCV
+            expected_thresh, expected_mat = cv.threshold(in_mat, maxv[0], maxv[0], cv.THRESH_TRIANGLE)
 
-        # G-API
-        g_in = cv.GMat()
-        b, g, r = cv.gapi.split3(g_in)
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(b, g, r))
+            # G-API
+            g_in = cv.GMat()
+            g_sc = cv.GScalar()
+            mat, threshold = cv.gapi.threshold(g_in, g_sc, cv.THRESH_TRIANGLE)
+            comp = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(mat, threshold))
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
-            # Comparison
-            for e, a in zip(expected, actual):
-                self.assertEqual(0.0, cv.norm(e, a, cv.NORM_INF),
+            for pkg_name, pkg in pkgs:
+                actual_mat, actual_thresh = comp.apply(cv.gin(in_mat, maxv), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected_mat, actual_mat, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
+                self.assertEqual(expected_mat.dtype, actual_mat.dtype,
+                                 'Failed on ' + pkg_name + ' backend')
+                self.assertEqual(expected_thresh, actual_thresh[0],
                                  'Failed on ' + pkg_name + ' backend')
-                self.assertEqual(e.dtype, a.dtype, 'Failed on ' + pkg_name + ' backend')
-
-
-    def test_threshold(self):
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in_mat = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
-        maxv = (30, 30)
-
-        # OpenCV
-        expected_thresh, expected_mat = cv.threshold(in_mat, maxv[0], maxv[0], cv.THRESH_TRIANGLE)
-
-        # G-API
-        g_in = cv.GMat()
-        g_sc = cv.GScalar()
-        mat, threshold = cv.gapi.threshold(g_in, g_sc, cv.THRESH_TRIANGLE)
-        comp = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(mat, threshold))
-
-        for pkg_name, pkg in pkgs:
-            actual_mat, actual_thresh = comp.apply(cv.gin(in_mat, maxv), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected_mat, actual_mat, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
-            self.assertEqual(expected_mat.dtype, actual_mat.dtype,
-                             'Failed on ' + pkg_name + ' backend')
-            self.assertEqual(expected_thresh, actual_thresh[0],
-                             'Failed on ' + pkg_name + ' backend')
-
-    def test_kmeans(self):
-        # K-means params
-        count    = 100
-        sz       = (count, 2)
-        in_mat   = np.random.random(sz).astype(np.float32)
-        K        = 5
-        flags    = cv.KMEANS_RANDOM_CENTERS
-        attempts = 1;
-        criteria = (cv.TERM_CRITERIA_MAX_ITER + cv.TERM_CRITERIA_EPS, 30, 0)
-
-        # G-API
-        g_in = cv.GMat()
-        compactness, out_labels, centers = cv.gapi.kmeans(g_in, K, criteria, attempts, flags)
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(compactness, out_labels, centers))
-
-        compact, labels, centers = comp.apply(cv.gin(in_mat))
-
-        # Assert
-        self.assertTrue(compact >= 0)
-        self.assertEqual(sz[0], labels.shape[0])
-        self.assertEqual(1, labels.shape[1])
-        self.assertTrue(labels.size != 0)
-        self.assertEqual(centers.shape[1], sz[1]);
-        self.assertEqual(centers.shape[0], K);
-        self.assertTrue(centers.size != 0);
-
-
-    def generate_random_points(self, sz):
-        arr = np.random.random(sz).astype(np.float32).T
-        return list(zip(arr[0], arr[1]))
-
-
-    def test_kmeans_2d(self):
-        # K-means 2D params
-        count     = 100
-        sz        = (count, 2)
-        amount    = sz[0]
-        K         = 5
-        flags     = cv.KMEANS_RANDOM_CENTERS
-        attempts  = 1;
-        criteria  = (cv.TERM_CRITERIA_MAX_ITER + cv.TERM_CRITERIA_EPS, 30, 0);
-        in_vector = self.generate_random_points(sz)
-        in_labels = []
-
-        # G-API
-        data        = cv.GArrayT(cv.gapi.CV_POINT2F)
-        best_labels = cv.GArrayT(cv.gapi.CV_INT)
-
-        compactness, out_labels, centers = cv.gapi.kmeans(data, K, best_labels, criteria, attempts, flags);
-        comp = cv.GComputation(cv.GIn(data, best_labels), cv.GOut(compactness, out_labels, centers));
-
-        compact, labels, centers = comp.apply(cv.gin(in_vector, in_labels));
-
-        # Assert
-        self.assertTrue(compact >= 0)
-        self.assertEqual(amount, len(labels))
-        self.assertEqual(K, len(centers))
+
+
+        def test_kmeans(self):
+            # K-means params
+            count    = 100
+            sz       = (count, 2)
+            in_mat   = np.random.random(sz).astype(np.float32)
+            K        = 5
+            flags    = cv.KMEANS_RANDOM_CENTERS
+            attempts = 1
+            criteria = (cv.TERM_CRITERIA_MAX_ITER + cv.TERM_CRITERIA_EPS, 30, 0)
+
+            # G-API
+            g_in = cv.GMat()
+            compactness, out_labels, centers = cv.gapi.kmeans(g_in, K, criteria, attempts, flags)
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(compactness, out_labels, centers))
+
+            compact, labels, centers = comp.apply(cv.gin(in_mat))
+
+            # Assert
+            self.assertTrue(compact >= 0)
+            self.assertEqual(sz[0], labels.shape[0])
+            self.assertEqual(1, labels.shape[1])
+            self.assertTrue(labels.size != 0)
+            self.assertEqual(centers.shape[1], sz[1])
+            self.assertEqual(centers.shape[0], K)
+            self.assertTrue(centers.size != 0)
+
+
+        def generate_random_points(self, sz):
+            arr = np.random.random(sz).astype(np.float32).T
+            return list(zip(arr[0], arr[1]))
+
+
+        def test_kmeans_2d(self):
+            # K-means 2D params
+            count     = 100
+            sz        = (count, 2)
+            amount    = sz[0]
+            K         = 5
+            flags     = cv.KMEANS_RANDOM_CENTERS
+            attempts  = 1
+            criteria  = (cv.TERM_CRITERIA_MAX_ITER + cv.TERM_CRITERIA_EPS, 30, 0)
+            in_vector = self.generate_random_points(sz)
+            in_labels = []
+
+            # G-API
+            data        = cv.GArrayT(cv.gapi.CV_POINT2F)
+            best_labels = cv.GArrayT(cv.gapi.CV_INT)
+
+            compactness, out_labels, centers = cv.gapi.kmeans(data, K, best_labels, criteria, attempts, flags)
+            comp = cv.GComputation(cv.GIn(data, best_labels), cv.GOut(compactness, out_labels, centers))
+
+            compact, labels, centers = comp.apply(cv.gin(in_vector, in_labels))
+
+            # Assert
+            self.assertTrue(compact >= 0)
+            self.assertEqual(amount, len(labels))
+            self.assertEqual(K, len(centers))
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+    pass
 
 
 if __name__ == '__main__':
diff --git a/modules/gapi/misc/python/test/test_gapi_imgproc.py b/modules/gapi/misc/python/test/test_gapi_imgproc.py
index ed6f883fe55f..365a5a8cca74 100644
--- a/modules/gapi/misc/python/test/test_gapi_imgproc.py
+++ b/modules/gapi/misc/python/test/test_gapi_imgproc.py
@@ -3,103 +3,124 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
 
 from tests_common import NewOpenCVTests
 
 
-# Plaidml is an optional backend
-pkgs = [
-           ('ocl'    , cv.gapi.core.ocl.kernels()),
-           ('cpu'    , cv.gapi.core.cpu.kernels()),
-           ('fluid'  , cv.gapi.core.fluid.kernels())
-           # ('plaidml', cv.gapi.core.plaidml.kernels())
-       ]
+try:
 
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
-class gapi_imgproc_test(NewOpenCVTests):
+    # Plaidml is an optional backend
+    pkgs = [
+               ('ocl'    , cv.gapi.core.ocl.kernels()),
+               ('cpu'    , cv.gapi.core.cpu.kernels()),
+               ('fluid'  , cv.gapi.core.fluid.kernels())
+               # ('plaidml', cv.gapi.core.plaidml.kernels())
+           ]
 
-    def test_good_features_to_track(self):
-        # TODO: Extend to use any type and size here
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in1 = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
 
-        # NB: goodFeaturesToTrack configuration
-        max_corners         = 50
-        quality_lvl         = 0.01
-        min_distance        = 10
-        block_sz            = 3
-        use_harris_detector = True
-        k                   = 0.04
-        mask                = None
+    class gapi_imgproc_test(NewOpenCVTests):
 
-        # OpenCV
-        expected = cv.goodFeaturesToTrack(in1, max_corners, quality_lvl,
-                                          min_distance, mask=mask,
-                                          blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
+        def test_good_features_to_track(self):
+            # TODO: Extend to use any type and size here
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in1 = cv.cvtColor(cv.imread(img_path), cv.COLOR_RGB2GRAY)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.goodFeaturesToTrack(g_in, max_corners, quality_lvl,
-                                            min_distance, mask, block_sz, use_harris_detector, k)
+            # NB: goodFeaturesToTrack configuration
+            max_corners         = 50
+            quality_lvl         = 0.01
+            min_distance        = 10
+            block_sz            = 3
+            use_harris_detector = True
+            k                   = 0.04
+            mask                = None
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+            # OpenCV
+            expected = cv.goodFeaturesToTrack(in1, max_corners, quality_lvl,
+                                              min_distance, mask=mask,
+                                              blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in1), args=cv.compile_args(pkg))
-            # NB: OpenCV & G-API have different output shapes:
-            # OpenCV - (num_points, 1, 2)
-            # G-API  - (num_points, 2)
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected.flatten(),
-                                          np.array(actual, dtype=np.float32).flatten(),
-                                          cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
+            # G-API
+            g_in = cv.GMat()
+            g_out = cv.gapi.goodFeaturesToTrack(g_in, max_corners, quality_lvl,
+                                                min_distance, mask, block_sz, use_harris_detector, k)
 
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
 
-    def test_rgb2gray(self):
-        # TODO: Extend to use any type and size here
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        in1 = cv.imread(img_path)
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in1), args=cv.gapi.compile_args(pkg))
+                # NB: OpenCV & G-API have different output shapes:
+                # OpenCV - (num_points, 1, 2)
+                # G-API  - (num_points, 2)
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected.flatten(),
+                                              np.array(actual, dtype=np.float32).flatten(),
+                                              cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
 
-        # OpenCV
-        expected = cv.cvtColor(in1, cv.COLOR_RGB2GRAY)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.RGB2Gray(g_in)
+        def test_rgb2gray(self):
+            # TODO: Extend to use any type and size here
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            in1 = cv.imread(img_path)
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+            # OpenCV
+            expected = cv.cvtColor(in1, cv.COLOR_RGB2GRAY)
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(in1), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
+            # G-API
+            g_in = cv.GMat()
+            g_out = cv.gapi.RGB2Gray(g_in)
 
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
 
-    def test_bounding_rect(self):
-        sz = 1280
-        fscale = 256
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(in1), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
 
-        def sample_value(fscale):
-            return np.random.uniform(0, 255 * fscale) / fscale
 
-        points = np.array([(sample_value(fscale), sample_value(fscale)) for _ in range(1280)], np.float32)
+        def test_bounding_rect(self):
+            sz = 1280
+            fscale = 256
 
-        # OpenCV
-        expected = cv.boundingRect(points)
+            def sample_value(fscale):
+                return np.random.uniform(0, 255 * fscale) / fscale
 
-        # G-API
-        g_in  = cv.GMat()
-        g_out = cv.gapi.boundingRect(g_in)
+            points = np.array([(sample_value(fscale), sample_value(fscale)) for _ in range(1280)], np.float32)
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+            # OpenCV
+            expected = cv.boundingRect(points)
 
-        for pkg_name, pkg in pkgs:
-            actual = comp.apply(cv.gin(points), args=cv.compile_args(pkg))
-            # Comparison
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
-                             'Failed on ' + pkg_name + ' backend')
+            # G-API
+            g_in  = cv.GMat()
+            g_out = cv.gapi.boundingRect(g_in)
+
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+
+            for pkg_name, pkg in pkgs:
+                actual = comp.apply(cv.gin(points), args=cv.gapi.compile_args(pkg))
+                # Comparison
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF),
+                                 'Failed on ' + pkg_name + ' backend')
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+    pass
 
 
 if __name__ == '__main__':
diff --git a/modules/gapi/misc/python/test/test_gapi_infer.py b/modules/gapi/misc/python/test/test_gapi_infer.py
index db048f57866c..8ecc957e416d 100644
--- a/modules/gapi/misc/python/test/test_gapi_infer.py
+++ b/modules/gapi/misc/python/test/test_gapi_infer.py
@@ -3,318 +3,338 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
 
 from tests_common import NewOpenCVTests
 
 
-class test_gapi_infer(NewOpenCVTests):
+try:
 
-    def infer_reference_network(self, model_path, weights_path, img):
-        net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
-        net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
-        net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
-        blob = cv.dnn.blobFromImage(img)
 
-        net.setInput(blob)
-        return net.forward(net.getUnconnectedOutLayersNames())
+    class test_gapi_infer(NewOpenCVTests):
 
+        def infer_reference_network(self, model_path, weights_path, img):
+            net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
+            net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
+            net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
 
-    def make_roi(self, img, roi):
-        return img[roi[1]:roi[1] + roi[3], roi[0]:roi[0] + roi[2], ...]
+            blob = cv.dnn.blobFromImage(img)
 
+            net.setInput(blob)
+            return net.forward(net.getUnconnectedOutLayersNames())
 
-    def test_age_gender_infer(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
 
-        root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        device_id    = 'CPU'
+        def make_roi(self, img, roi):
+            return img[roi[1]:roi[1] + roi[3], roi[0]:roi[0] + roi[2], ...]
 
-        img_path  = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        img       = cv.resize(cv.imread(img_path), (62,62))
 
-        # OpenCV DNN
-        dnn_age, dnn_gender = self.infer_reference_network(model_path, weights_path, img)
+        def test_age_gender_infer(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
 
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        inputs = cv.GInferInputs()
-        inputs.setInput('data', g_in)
+            root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            device_id    = 'CPU'
 
-        outputs  = cv.gapi.infer("net", inputs)
-        age_g    = outputs.at("age_conv3")
-        gender_g = outputs.at("prob")
+            img_path  = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            img       = cv.resize(cv.imread(img_path), (62,62))
 
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(age_g, gender_g))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
+            # OpenCV DNN
+            dnn_age, dnn_gender = self.infer_reference_network(model_path, weights_path, img)
 
-        gapi_age, gapi_gender = comp.apply(cv.gin(img), args=cv.compile_args(cv.gapi.networks(pp)))
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            inputs = cv.GInferInputs()
+            inputs.setInput('data', g_in)
 
-        # Check
-        self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
-        self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
+            outputs  = cv.gapi.infer("net", inputs)
+            age_g    = outputs.at("age_conv3")
+            gender_g = outputs.at("prob")
 
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(age_g, gender_g))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
 
-    def test_age_gender_infer_roi(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
+            gapi_age, gapi_gender = comp.apply(cv.gin(img), args=cv.gapi.compile_args(cv.gapi.networks(pp)))
 
-        root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        device_id    = 'CPU'
+            # Check
+            self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
+            self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
 
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        img = cv.imread(img_path)
-        roi      = (10, 10, 62, 62)
 
-        # OpenCV DNN
-        dnn_age, dnn_gender = self.infer_reference_network(model_path,
-                                                           weights_path,
-                                                           self.make_roi(img, roi))
+        def test_age_gender_infer_roi(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
 
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        g_roi  = cv.GOpaqueT(cv.gapi.CV_RECT)
-        inputs = cv.GInferInputs()
-        inputs.setInput('data', g_in)
-
-        outputs  = cv.gapi.infer("net", g_roi, inputs)
-        age_g    = outputs.at("age_conv3")
-        gender_g = outputs.at("prob")
-
-        comp = cv.GComputation(cv.GIn(g_in, g_roi), cv.GOut(age_g, gender_g))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
-
-        gapi_age, gapi_gender = comp.apply(cv.gin(img, roi), args=cv.compile_args(cv.gapi.networks(pp)))
-
-        # Check
-        self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
-        self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
-
-
-    def test_age_gender_infer_roi_list(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
-
-        root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        device_id    = 'CPU'
-
-        rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        img = cv.imread(img_path)
-
-        # OpenCV DNN
-        dnn_age_list    = []
-        dnn_gender_list = []
-        for roi in rois:
-            age, gender = self.infer_reference_network(model_path,
-                                                       weights_path,
-                                                       self.make_roi(img, roi))
-            dnn_age_list.append(age)
-            dnn_gender_list.append(gender)
-
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        g_rois = cv.GArrayT(cv.gapi.CV_RECT)
-        inputs = cv.GInferInputs()
-        inputs.setInput('data', g_in)
-
-        outputs  = cv.gapi.infer("net", g_rois, inputs)
-        age_g    = outputs.at("age_conv3")
-        gender_g = outputs.at("prob")
-
-        comp = cv.GComputation(cv.GIn(g_in, g_rois), cv.GOut(age_g, gender_g))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
-
-        gapi_age_list, gapi_gender_list = comp.apply(cv.gin(img, rois),
-                                                     args=cv.compile_args(cv.gapi.networks(pp)))
-
-        # Check
-        for gapi_age, gapi_gender, dnn_age, dnn_gender in zip(gapi_age_list,
-                                                              gapi_gender_list,
-                                                              dnn_age_list,
-                                                              dnn_gender_list):
-            self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
-            self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
+            root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            device_id    = 'CPU'
+
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            img = cv.imread(img_path)
+            roi      = (10, 10, 62, 62)
+
+            # OpenCV DNN
+            dnn_age, dnn_gender = self.infer_reference_network(model_path,
+                                                               weights_path,
+                                                               self.make_roi(img, roi))
+
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            g_roi  = cv.GOpaqueT(cv.gapi.CV_RECT)
+            inputs = cv.GInferInputs()
+            inputs.setInput('data', g_in)
+
+            outputs  = cv.gapi.infer("net", g_roi, inputs)
+            age_g    = outputs.at("age_conv3")
+            gender_g = outputs.at("prob")
+
+            comp = cv.GComputation(cv.GIn(g_in, g_roi), cv.GOut(age_g, gender_g))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
 
+            gapi_age, gapi_gender = comp.apply(cv.gin(img, roi), args=cv.gapi.compile_args(cv.gapi.networks(pp)))
 
-    def test_age_gender_infer2_roi(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
-
-        root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        device_id    = 'CPU'
-
-        rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
-        img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        img = cv.imread(img_path)
-
-        # OpenCV DNN
-        dnn_age_list    = []
-        dnn_gender_list = []
-        for roi in rois:
-            age, gender = self.infer_reference_network(model_path,
-                                                       weights_path,
-                                                       self.make_roi(img, roi))
-            dnn_age_list.append(age)
-            dnn_gender_list.append(gender)
-
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        g_rois = cv.GArrayT(cv.gapi.CV_RECT)
-        inputs = cv.GInferListInputs()
-        inputs.setInput('data', g_rois)
-
-        outputs  = cv.gapi.infer2("net", g_in, inputs)
-        age_g    = outputs.at("age_conv3")
-        gender_g = outputs.at("prob")
-
-        comp = cv.GComputation(cv.GIn(g_in, g_rois), cv.GOut(age_g, gender_g))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
-
-        gapi_age_list, gapi_gender_list = comp.apply(cv.gin(img, rois),
-                                                     args=cv.compile_args(cv.gapi.networks(pp)))
-
-        # Check
-        for gapi_age, gapi_gender, dnn_age, dnn_gender in zip(gapi_age_list,
-                                                              gapi_gender_list,
-                                                              dnn_age_list,
-                                                              dnn_gender_list):
+            # Check
             self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
             self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
 
 
+        def test_age_gender_infer_roi_list(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
+
+            root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            device_id    = 'CPU'
+
+            rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            img = cv.imread(img_path)
+
+            # OpenCV DNN
+            dnn_age_list    = []
+            dnn_gender_list = []
+            for roi in rois:
+                age, gender = self.infer_reference_network(model_path,
+                                                           weights_path,
+                                                           self.make_roi(img, roi))
+                dnn_age_list.append(age)
+                dnn_gender_list.append(gender)
+
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            g_rois = cv.GArrayT(cv.gapi.CV_RECT)
+            inputs = cv.GInferInputs()
+            inputs.setInput('data', g_in)
+
+            outputs  = cv.gapi.infer("net", g_rois, inputs)
+            age_g    = outputs.at("age_conv3")
+            gender_g = outputs.at("prob")
+
+            comp = cv.GComputation(cv.GIn(g_in, g_rois), cv.GOut(age_g, gender_g))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
+
+            gapi_age_list, gapi_gender_list = comp.apply(cv.gin(img, rois),
+                                                         args=cv.gapi.compile_args(cv.gapi.networks(pp)))
+
+            # Check
+            for gapi_age, gapi_gender, dnn_age, dnn_gender in zip(gapi_age_list,
+                                                                  gapi_gender_list,
+                                                                  dnn_age_list,
+                                                                  dnn_gender_list):
+                self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
+                self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
+
+
+        def test_age_gender_infer2_roi(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
+
+            root_path    = '/omz_intel_models/intel/age-gender-recognition-retail-0013/FP32/age-gender-recognition-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            device_id    = 'CPU'
+
+            rois = [(10, 15, 62, 62), (23, 50, 62, 62), (14, 100, 62, 62), (80, 50, 62, 62)]
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            img = cv.imread(img_path)
+
+            # OpenCV DNN
+            dnn_age_list    = []
+            dnn_gender_list = []
+            for roi in rois:
+                age, gender = self.infer_reference_network(model_path,
+                                                           weights_path,
+                                                           self.make_roi(img, roi))
+                dnn_age_list.append(age)
+                dnn_gender_list.append(gender)
+
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            g_rois = cv.GArrayT(cv.gapi.CV_RECT)
+            inputs = cv.GInferListInputs()
+            inputs.setInput('data', g_rois)
+
+            outputs  = cv.gapi.infer2("net", g_in, inputs)
+            age_g    = outputs.at("age_conv3")
+            gender_g = outputs.at("prob")
+
+            comp = cv.GComputation(cv.GIn(g_in, g_rois), cv.GOut(age_g, gender_g))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
+
+            gapi_age_list, gapi_gender_list = comp.apply(cv.gin(img, rois),
+                                                         args=cv.gapi.compile_args(cv.gapi.networks(pp)))
+
+            # Check
+            for gapi_age, gapi_gender, dnn_age, dnn_gender in zip(gapi_age_list,
+                                                                  gapi_gender_list,
+                                                                  dnn_age_list,
+                                                                  dnn_gender_list):
+                self.assertEqual(0.0, cv.norm(dnn_gender, gapi_gender, cv.NORM_INF))
+                self.assertEqual(0.0, cv.norm(dnn_age, gapi_age, cv.NORM_INF))
+
+
+
+        def test_person_detection_retail_0013(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
+
+            root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            device_id    = 'CPU'
+            img          = cv.resize(cv.imread(img_path), (544, 320))
+
+            # OpenCV DNN
+            net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
+            net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
+            net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
+
+            blob = cv.dnn.blobFromImage(img)
+
+            def parseSSD(detections, size):
+                h, w = size
+                bboxes = []
+                detections = detections.reshape(-1, 7)
+                for sample_id, class_id, confidence, xmin, ymin, xmax, ymax in detections:
+                    if confidence >= 0.5:
+                        x      = int(xmin * w)
+                        y      = int(ymin * h)
+                        width  = int(xmax * w - x)
+                        height = int(ymax * h - y)
+                        bboxes.append((x, y, width, height))
+
+                return bboxes
+
+            net.setInput(blob)
+            dnn_detections = net.forward()
+            dnn_boxes = parseSSD(np.array(dnn_detections), img.shape[:2])
+
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            inputs = cv.GInferInputs()
+            inputs.setInput('data', g_in)
+
+            g_sz       = cv.gapi.streaming.size(g_in)
+            outputs    = cv.gapi.infer("net", inputs)
+            detections = outputs.at("detection_out")
+            bboxes     = cv.gapi.parseSSD(detections, g_sz, 0.5, False, False)
+
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(bboxes))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
+
+            gapi_boxes = comp.apply(cv.gin(img.astype(np.float32)),
+                                    args=cv.gapi.compile_args(cv.gapi.networks(pp)))
+
+            # Comparison
+            self.assertEqual(0.0, cv.norm(np.array(dnn_boxes).flatten(),
+                                          np.array(gapi_boxes).flatten(),
+                                          cv.NORM_INF))
+
+
+        def test_person_detection_retail_0013(self):
+            # NB: Check IE
+            if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
+                return
+
+            root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
+            model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
+            img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            device_id    = 'CPU'
+            img          = cv.resize(cv.imread(img_path), (544, 320))
+
+            # OpenCV DNN
+            net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
+            net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
+            net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
+
+            blob = cv.dnn.blobFromImage(img)
+
+            def parseSSD(detections, size):
+                h, w = size
+                bboxes = []
+                detections = detections.reshape(-1, 7)
+                for sample_id, class_id, confidence, xmin, ymin, xmax, ymax in detections:
+                    if confidence >= 0.5:
+                        x      = int(xmin * w)
+                        y      = int(ymin * h)
+                        width  = int(xmax * w - x)
+                        height = int(ymax * h - y)
+                        bboxes.append((x, y, width, height))
+
+                return bboxes
+
+            net.setInput(blob)
+            dnn_detections = net.forward()
+            dnn_boxes = parseSSD(np.array(dnn_detections), img.shape[:2])
+
+            # OpenCV G-API
+            g_in   = cv.GMat()
+            inputs = cv.GInferInputs()
+            inputs.setInput('data', g_in)
+
+            g_sz       = cv.gapi.streaming.size(g_in)
+            outputs    = cv.gapi.infer("net", inputs)
+            detections = outputs.at("detection_out")
+            bboxes     = cv.gapi.parseSSD(detections, g_sz, 0.5, False, False)
+
+            comp = cv.GComputation(cv.GIn(g_in), cv.GOut(bboxes))
+            pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
+
+            gapi_boxes = comp.apply(cv.gin(img.astype(np.float32)),
+                                    args=cv.gapi.compile_args(cv.gapi.networks(pp)))
+
+            # Comparison
+            self.assertEqual(0.0, cv.norm(np.array(dnn_boxes).flatten(),
+                                          np.array(gapi_boxes).flatten(),
+                                          cv.NORM_INF))
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
 
-    def test_person_detection_retail_0013(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
-
-        root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        device_id    = 'CPU'
-        img          = cv.resize(cv.imread(img_path), (544, 320))
-
-        # OpenCV DNN
-        net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
-        net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
-        net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
-
-        blob = cv.dnn.blobFromImage(img)
-
-        def parseSSD(detections, size):
-            h, w = size
-            bboxes = []
-            detections = detections.reshape(-1, 7)
-            for sample_id, class_id, confidence, xmin, ymin, xmax, ymax in detections:
-                if confidence >= 0.5:
-                    x      = int(xmin * w)
-                    y      = int(ymin * h)
-                    width  = int(xmax * w - x)
-                    height = int(ymax * h - y)
-                    bboxes.append((x, y, width, height))
-
-            return bboxes
-
-        net.setInput(blob)
-        dnn_detections = net.forward()
-        dnn_boxes = parseSSD(np.array(dnn_detections), img.shape[:2])
-
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        inputs = cv.GInferInputs()
-        inputs.setInput('data', g_in)
-
-        g_sz       = cv.gapi.streaming.size(g_in)
-        outputs    = cv.gapi.infer("net", inputs)
-        detections = outputs.at("detection_out")
-        bboxes     = cv.gapi.parseSSD(detections, g_sz, 0.5, False, False)
-
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(bboxes))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
-
-        gapi_age, gapi_gender = comp.apply(cv.gin(img), args=cv.compile_args(cv.gapi.networks(pp)))
-
-        gapi_boxes = comp.apply(cv.gin(img.astype(np.float32)),
-                                args=cv.compile_args(cv.gapi.networks(pp)))
-
-        # Comparison
-        self.assertEqual(0.0, cv.norm(np.array(dnn_boxes).flatten(),
-                                      np.array(gapi_boxes).flatten(),
-                                      cv.NORM_INF))
-
-
-    def test_person_detection_retail_0013(self):
-        # NB: Check IE
-        if not cv.dnn.DNN_TARGET_CPU in cv.dnn.getAvailableTargets(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE):
-            return
-
-        root_path    = '/omz_intel_models/intel/person-detection-retail-0013/FP32/person-detection-retail-0013'
-        model_path   = self.find_file(root_path + '.xml',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        weights_path = self.find_file(root_path + '.bin',   [os.environ.get('OPENCV_DNN_TEST_DATA_PATH')])
-        img_path     = self.find_file('gpu/lbpcascade/er.png', [os.environ.get('OPENCV_TEST_DATA_PATH')])
-        device_id    = 'CPU'
-        img          = cv.resize(cv.imread(img_path), (544, 320))
-
-        # OpenCV DNN
-        net = cv.dnn.readNetFromModelOptimizer(model_path, weights_path)
-        net.setPreferableBackend(cv.dnn.DNN_BACKEND_INFERENCE_ENGINE)
-        net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)
-
-        blob = cv.dnn.blobFromImage(img)
-
-        def parseSSD(detections, size):
-            h, w = size
-            bboxes = []
-            detections = detections.reshape(-1, 7)
-            for sample_id, class_id, confidence, xmin, ymin, xmax, ymax in detections:
-                if confidence >= 0.5:
-                    x      = int(xmin * w)
-                    y      = int(ymin * h)
-                    width  = int(xmax * w - x)
-                    height = int(ymax * h - y)
-                    bboxes.append((x, y, width, height))
-
-            return bboxes
-
-        net.setInput(blob)
-        dnn_detections = net.forward()
-        dnn_boxes = parseSSD(np.array(dnn_detections), img.shape[:2])
-
-        # OpenCV G-API
-        g_in   = cv.GMat()
-        inputs = cv.GInferInputs()
-        inputs.setInput('data', g_in)
-
-        g_sz       = cv.gapi.streaming.size(g_in)
-        outputs    = cv.gapi.infer("net", inputs)
-        detections = outputs.at("detection_out")
-        bboxes     = cv.gapi.parseSSD(detections, g_sz, 0.5, False, False)
-
-        comp = cv.GComputation(cv.GIn(g_in), cv.GOut(bboxes))
-        pp = cv.gapi.ie.params("net", model_path, weights_path, device_id)
-
-        gapi_boxes = comp.apply(cv.gin(img.astype(np.float32)),
-                                args=cv.compile_args(cv.gapi.networks(pp)))
-
-        # Comparison
-        self.assertEqual(0.0, cv.norm(np.array(dnn_boxes).flatten(),
-                                      np.array(gapi_boxes).flatten(),
-                                      cv.NORM_INF))
+    pass
 
 
 if __name__ == '__main__':
diff --git a/modules/gapi/misc/python/test/test_gapi_render.py b/modules/gapi/misc/python/test/test_gapi_render.py
new file mode 100644
index 000000000000..70601a72e57d
--- /dev/null
+++ b/modules/gapi/misc/python/test/test_gapi_render.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+import numpy as np
+import cv2 as cv
+import os
+import sys
+import unittest
+
+from tests_common import NewOpenCVTests
+
+try:
+
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
+
+    # FIXME: FText isn't supported yet.
+    class gapi_render_test(NewOpenCVTests):
+        def __init__(self, *args):
+            super().__init__(*args)
+
+            self.size = (300, 300, 3)
+
+            # Rect
+            self.rect = (30, 30, 50, 50)
+            self.rcolor = (0, 255, 0)
+            self.rlt = cv.LINE_4
+            self.rthick = 2
+            self.rshift = 3
+
+            # Text
+            self.text = 'Hello, world!'
+            self.org = (100, 100)
+            self.ff = cv.FONT_HERSHEY_SIMPLEX
+            self.fs = 1.0
+            self.tthick = 2
+            self.tlt = cv.LINE_8
+            self.tcolor = (255, 255, 255)
+            self.blo = False
+
+            # Circle
+            self.center = (200, 200)
+            self.radius = 200
+            self.ccolor = (255, 255, 0)
+            self.cthick = 2
+            self.clt = cv.LINE_4
+            self.cshift = 1
+
+            # Line
+            self.pt1 = (50, 50)
+            self.pt2 = (200, 200)
+            self.lcolor = (0, 255, 128)
+            self.lthick = 5
+            self.llt = cv.LINE_8
+            self.lshift = 2
+
+            # Poly
+            self.pts = [(50, 100), (100, 200), (25, 250)]
+            self.pcolor = (0, 0, 255)
+            self.pthick = 3
+            self.plt = cv.LINE_4
+            self.pshift = 1
+
+            # Image
+            self.iorg = (150, 150)
+            img_path = self.find_file('cv/face/david2.jpg', [os.environ.get('OPENCV_TEST_DATA_PATH')])
+            self.img = cv.resize(cv.imread(img_path), (50, 50))
+            self.alpha = np.full(self.img.shape[:2], 0.8, dtype=np.float32)
+
+            # Mosaic
+            self.mos = (100, 100, 100, 100)
+            self.cell_sz = 25
+            self.decim = 0
+
+            # Render primitives
+            self.prims = [cv.gapi.wip.draw.Rect(self.rect, self.rcolor, self.rthick, self.rlt, self.rshift),
+                          cv.gapi.wip.draw.Text(self.text, self.org, self.ff, self.fs, self.tcolor, self.tthick, self.tlt, self.blo),
+                          cv.gapi.wip.draw.Circle(self.center, self.radius, self.ccolor, self.cthick, self.clt, self.cshift),
+                          cv.gapi.wip.draw.Line(self.pt1, self.pt2, self.lcolor, self.lthick, self.llt, self.lshift),
+                          cv.gapi.wip.draw.Mosaic(self.mos, self.cell_sz, self.decim),
+                          cv.gapi.wip.draw.Image(self.iorg, self.img, self.alpha),
+                          cv.gapi.wip.draw.Poly(self.pts, self.pcolor, self.pthick, self.plt, self.pshift)]
+
+        def cvt_nv12_to_yuv(self, y, uv):
+            h,w,_ = uv.shape
+            upsample_uv = cv.resize(uv, (h * 2, w * 2))
+            return cv.merge([y, upsample_uv])
+
+        def cvt_yuv_to_nv12(self, yuv, y_out, uv_out):
+            chs = cv.split(yuv, [y_out, None, None])
+            uv = cv.merge([chs[1], chs[2]])
+            uv_out = cv.resize(uv, (uv.shape[0] // 2, uv.shape[1] // 2), dst=uv_out)
+            return y_out, uv_out
+
+        def cvt_bgr_to_yuv_color(self, bgr):
+            y = bgr[2] *  0.299000 + bgr[1] *  0.587000 + bgr[0] *  0.114000;
+            u = bgr[2] * -0.168736 + bgr[1] * -0.331264 + bgr[0] *  0.500000 + 128;
+            v = bgr[2] *  0.500000 + bgr[1] * -0.418688 + bgr[0] * -0.081312 + 128;
+            return (y, u, v)
+
+        def blend_img(self, background, org, img, alpha):
+            x, y = org
+            h, w, _ = img.shape
+            roi_img = background[x:x+w, y:y+h, :]
+            img32f_w = cv.merge([alpha] * 3).astype(np.float32)
+            roi32f_w = np.full(roi_img.shape, 1.0, dtype=np.float32)
+            roi32f_w -= img32f_w
+            img32f = (img / 255).astype(np.float32)
+            roi32f = (roi_img / 255).astype(np.float32)
+            cv.multiply(img32f, img32f_w, dst=img32f)
+            cv.multiply(roi32f, roi32f_w, dst=roi32f)
+            roi32f += img32f
+            roi_img[...] = np.round(roi32f * 255)
+
+        # This is quite naive implementations used as a simple reference
+        # doesn't consider corner cases.
+        def draw_mosaic(self, img, mos, cell_sz, decim):
+            x,y,w,h = mos
+            mosaic_area = img[x:x+w, y:y+h, :]
+            for i in range(0, mosaic_area.shape[0], cell_sz):
+                for j in range(0, mosaic_area.shape[1], cell_sz):
+                    cell_roi = mosaic_area[j:j+cell_sz, i:i+cell_sz, :]
+                    s0, s1, s2 = cv.mean(cell_roi)[:3]
+                    mosaic_area[j:j+cell_sz, i:i+cell_sz] = (round(s0), round(s1), round(s2))
+
+        def render_primitives_bgr_ref(self, img):
+            cv.rectangle(img, self.rect, self.rcolor, self.rthick, self.rlt, self.rshift)
+            cv.putText(img, self.text, self.org, self.ff, self.fs, self.tcolor, self.tthick, self.tlt, self.blo)
+            cv.circle(img, self.center, self.radius, self.ccolor, self.cthick, self.clt, self.cshift)
+            cv.line(img, self.pt1, self.pt2, self.lcolor, self.lthick, self.llt, self.lshift)
+            cv.fillPoly(img, np.expand_dims(np.array([self.pts]), axis=0), self.pcolor, self.plt, self.pshift)
+            self.draw_mosaic(img, self.mos, self.cell_sz, self.decim)
+            self.blend_img(img, self.iorg, self.img, self.alpha)
+
+        def render_primitives_nv12_ref(self, y_plane, uv_plane):
+            yuv = self.cvt_nv12_to_yuv(y_plane, uv_plane)
+            cv.rectangle(yuv, self.rect, self.cvt_bgr_to_yuv_color(self.rcolor), self.rthick, self.rlt, self.rshift)
+            cv.putText(yuv, self.text, self.org, self.ff, self.fs, self.cvt_bgr_to_yuv_color(self.tcolor), self.tthick, self.tlt, self.blo)
+            cv.circle(yuv, self.center, self.radius, self.cvt_bgr_to_yuv_color(self.ccolor), self.cthick, self.clt, self.cshift)
+            cv.line(yuv, self.pt1, self.pt2, self.cvt_bgr_to_yuv_color(self.lcolor), self.lthick, self.llt, self.lshift)
+            cv.fillPoly(yuv, np.expand_dims(np.array([self.pts]), axis=0), self.cvt_bgr_to_yuv_color(self.pcolor), self.plt, self.pshift)
+            self.draw_mosaic(yuv, self.mos, self.cell_sz, self.decim)
+            self.blend_img(yuv, self.iorg, cv.cvtColor(self.img, cv.COLOR_BGR2YUV), self.alpha)
+            self.cvt_yuv_to_nv12(yuv, y_plane, uv_plane)
+
+        def test_render_primitives_on_bgr_graph(self):
+            expected = np.zeros(self.size, dtype=np.uint8)
+            actual = np.array(expected, copy=True)
+
+            # OpenCV
+            self.render_primitives_bgr_ref(expected)
+
+            # G-API
+            g_in = cv.GMat()
+            g_prims = cv.GArray.Prim()
+            g_out = cv.gapi.wip.draw.render3ch(g_in, g_prims)
+
+
+            comp = cv.GComputation(cv.GIn(g_in, g_prims), cv.GOut(g_out))
+            actual = comp.apply(cv.gin(actual, self.prims))
+
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+
+        def test_render_primitives_on_bgr_function(self):
+            expected = np.zeros(self.size, dtype=np.uint8)
+            actual = np.array(expected, copy=True)
+
+            # OpenCV
+            self.render_primitives_bgr_ref(expected)
+
+            # G-API
+            cv.gapi.wip.draw.render(actual, self.prims)
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+
+        def test_render_primitives_on_nv12_graph(self):
+            y_expected = np.zeros((self.size[0], self.size[1], 1), dtype=np.uint8)
+            uv_expected = np.zeros((self.size[0] // 2, self.size[1] // 2, 2), dtype=np.uint8)
+
+            y_actual = np.array(y_expected, copy=True)
+            uv_actual = np.array(uv_expected, copy=True)
+
+            # OpenCV
+            self.render_primitives_nv12_ref(y_expected, uv_expected)
+
+            # G-API
+            g_y = cv.GMat()
+            g_uv = cv.GMat()
+            g_prims = cv.GArray.Prim()
+            g_out_y, g_out_uv = cv.gapi.wip.draw.renderNV12(g_y, g_uv, g_prims)
+
+            comp = cv.GComputation(cv.GIn(g_y, g_uv, g_prims), cv.GOut(g_out_y, g_out_uv))
+            y_actual, uv_actual = comp.apply(cv.gin(y_actual, uv_actual, self.prims))
+
+            self.assertEqual(0.0, cv.norm(y_expected, y_actual, cv.NORM_INF))
+            self.assertEqual(0.0, cv.norm(uv_expected, uv_actual, cv.NORM_INF))
+
+        def test_render_primitives_on_nv12_function(self):
+            y_expected = np.zeros((self.size[0], self.size[1], 1), dtype=np.uint8)
+            uv_expected = np.zeros((self.size[0] // 2, self.size[1] // 2, 2), dtype=np.uint8)
+
+            y_actual = np.array(y_expected, copy=True)
+            uv_actual = np.array(uv_expected, copy=True)
+
+            # OpenCV
+            self.render_primitives_nv12_ref(y_expected, uv_expected)
+
+            # G-API
+            cv.gapi.wip.draw.render(y_actual, uv_actual, self.prims)
+
+            self.assertEqual(0.0, cv.norm(y_expected, y_actual, cv.NORM_INF))
+            self.assertEqual(0.0, cv.norm(uv_expected, uv_actual, cv.NORM_INF))
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+    pass
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
diff --git a/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py b/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
index 2f921901db7a..a10d63f09ef2 100644
--- a/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
+++ b/modules/gapi/misc/python/test/test_gapi_sample_pipelines.py
@@ -225,7 +225,7 @@ def test_custom_op_add(self):
             comp = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(g_out))
 
             pkg = cv.gapi.kernels(GAddImpl)
-            actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(in_mat1, in_mat2), args=cv.gapi.compile_args(pkg))
 
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
@@ -245,7 +245,7 @@ def test_custom_op_split3(self):
             comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_ch1, g_ch2, g_ch3))
 
             pkg = cv.gapi.kernels(GSplit3Impl)
-            ch1, ch2, ch3 = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            ch1, ch2, ch3 = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
 
             self.assertEqual(0.0, cv.norm(in_ch1, ch1, cv.NORM_INF))
             self.assertEqual(0.0, cv.norm(in_ch2, ch2, cv.NORM_INF))
@@ -266,7 +266,7 @@ def test_custom_op_mean(self):
             comp = cv.GComputation(g_in, g_out)
 
             pkg    = cv.gapi.kernels(GMeanImpl)
-            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
 
             # Comparison
             self.assertEqual(expected, actual)
@@ -287,7 +287,7 @@ def test_custom_op_addC(self):
             comp  = cv.GComputation(cv.GIn(g_in, g_sc), cv.GOut(g_out))
 
             pkg = cv.gapi.kernels(GAddCImpl)
-            actual = comp.apply(cv.gin(in_mat, sc), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(in_mat, sc), args=cv.gapi.compile_args(pkg))
 
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
@@ -305,7 +305,7 @@ def test_custom_op_size(self):
             comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_sz))
 
             pkg = cv.gapi.kernels(GSizeImpl)
-            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
 
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
@@ -322,7 +322,7 @@ def test_custom_op_sizeR(self):
             comp = cv.GComputation(cv.GIn(g_r), cv.GOut(g_sz))
 
             pkg = cv.gapi.kernels(GSizeRImpl)
-            actual = comp.apply(cv.gin(roi), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(roi), args=cv.gapi.compile_args(pkg))
 
             # cv.norm works with tuples ?
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
@@ -340,7 +340,7 @@ def test_custom_op_boundingRect(self):
             comp  = cv.GComputation(cv.GIn(g_pts), cv.GOut(g_br))
 
             pkg = cv.gapi.kernels(GBoundingRectImpl)
-            actual = comp.apply(cv.gin(points), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(points), args=cv.gapi.compile_args(pkg))
 
             # cv.norm works with tuples ?
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
@@ -371,7 +371,7 @@ def test_custom_op_goodFeaturesToTrack(self):
 
             comp = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
             pkg = cv.gapi.kernels(GGoodFeaturesImpl)
-            actual = comp.apply(cv.gin(in_mat), args=cv.compile_args(pkg))
+            actual = comp.apply(cv.gin(in_mat), args=cv.gapi.compile_args(pkg))
 
             # NB: OpenCV & G-API have different output types.
             # OpenCV - numpy array with shape (num_points, 1, 2)
@@ -453,10 +453,10 @@ def run(arr):
             g_in  = cv.GArray.Int()
             comp  = cv.GComputation(cv.GIn(g_in), cv.GOut(GSum.on(g_in)))
 
-            s = comp.apply(cv.gin([1, 2, 3, 4]), args=cv.compile_args(cv.gapi.kernels(GSumImpl)))
+            s = comp.apply(cv.gin([1, 2, 3, 4]), args=cv.gapi.compile_args(cv.gapi.kernels(GSumImpl)))
             self.assertEqual(10, s)
 
-            s = comp.apply(cv.gin([1, 2, 8, 7]), args=cv.compile_args(cv.gapi.kernels(GSumImpl)))
+            s = comp.apply(cv.gin([1, 2, 8, 7]), args=cv.gapi.compile_args(cv.gapi.kernels(GSumImpl)))
             self.assertEqual(18, s)
 
             self.assertEqual(18, GSumImpl.last_result)
@@ -488,13 +488,13 @@ def run(table, key):
                         'tuple': (42, 42)
                     }
 
-            out = comp.apply(cv.gin(table, 'int'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            out = comp.apply(cv.gin(table, 'int'), args=cv.gapi.compile_args(cv.gapi.kernels(GLookUpImpl)))
             self.assertEqual(42, out)
 
-            out = comp.apply(cv.gin(table, 'str'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            out = comp.apply(cv.gin(table, 'str'), args=cv.gapi.compile_args(cv.gapi.kernels(GLookUpImpl)))
             self.assertEqual('hello, world!', out)
 
-            out = comp.apply(cv.gin(table, 'tuple'), args=cv.compile_args(cv.gapi.kernels(GLookUpImpl)))
+            out = comp.apply(cv.gin(table, 'tuple'), args=cv.gapi.compile_args(cv.gapi.kernels(GLookUpImpl)))
             self.assertEqual((42, 42), out)
 
 
@@ -521,7 +521,7 @@ def run(arr0, arr1):
             arr1 = [3,    'str']
 
             out = comp.apply(cv.gin(arr0, arr1),
-                             args=cv.compile_args(cv.gapi.kernels(GConcatImpl)))
+                             args=cv.gapi.compile_args(cv.gapi.kernels(GConcatImpl)))
 
             self.assertEqual(arr0 + arr1, out)
 
@@ -550,7 +550,7 @@ def run(img0, img1):
             img1 = np.array([1, 2, 3])
 
             with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
-                                                          args=cv.compile_args(
+                                                          args=cv.gapi.compile_args(
                                                               cv.gapi.kernels(GAddImpl)))
 
 
@@ -577,7 +577,7 @@ def run(img0, img1):
             img1 = np.array([1, 2, 3])
 
             with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
-                                                          args=cv.compile_args(
+                                                          args=cv.gapi.compile_args(
                                                               cv.gapi.kernels(GAddImpl)))
 
 
@@ -607,7 +607,7 @@ def run(img0, img1):
             # FIXME: Cause Bad variant access.
             # Need to provide more descriptive error messsage.
             with self.assertRaises(Exception): comp.apply(cv.gin(img0, img1),
-                                                          args=cv.compile_args(
+                                                          args=cv.gapi.compile_args(
                                                               cv.gapi.kernels(GAddImpl)))
 
         def test_pipeline_with_custom_kernels(self):
@@ -657,7 +657,7 @@ def run(img, order):
             g_mean       = cv.gapi.mean(g_transposed)
 
             comp = cv.GComputation(cv.GIn(g_bgr), cv.GOut(g_mean))
-            actual = comp.apply(cv.gin(img), args=cv.compile_args(
+            actual = comp.apply(cv.gin(img), args=cv.gapi.compile_args(
                 cv.gapi.kernels(GResizeImpl, GTransposeImpl)))
 
             self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
diff --git a/modules/gapi/misc/python/test/test_gapi_streaming.py b/modules/gapi/misc/python/test/test_gapi_streaming.py
index 5356abc76afd..7ede1b5cf38d 100644
--- a/modules/gapi/misc/python/test/test_gapi_streaming.py
+++ b/modules/gapi/misc/python/test/test_gapi_streaming.py
@@ -3,201 +3,323 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
+import time
 
 from tests_common import NewOpenCVTests
 
-class test_gapi_streaming(NewOpenCVTests):
 
-    def test_image_input(self):
-        sz = (1280, 720)
-        in_mat = np.random.randint(0, 100, sz).astype(np.uint8)
+try:
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
-        # OpenCV
-        expected = cv.medianBlur(in_mat, 3)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.medianBlur(g_in, 3)
-        c = cv.GComputation(g_in, g_out)
-        ccomp = c.compileStreaming(cv.descr_of(in_mat))
-        ccomp.setSource(cv.gin(in_mat))
-        ccomp.start()
+    @cv.gapi.op('custom.delay', in_types=[cv.GMat], out_types=[cv.GMat])
+    class GDelay:
+        """Delay for 10 ms."""
 
-        _, actual = ccomp.pull()
+        @staticmethod
+        def outMeta(desc):
+            return desc
 
-        # Assert
-        self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
+    @cv.gapi.kernel(GDelay)
+    class GDelayImpl:
+        """Implementation for GDelay operation."""
 
-    def test_video_input(self):
-        ksize = 3
-        path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+        @staticmethod
+        def run(img):
+            time.sleep(0.01)
+            return img
 
-        # OpenCV
-        cap = cv.VideoCapture(path)
 
-        # G-API
-        g_in = cv.GMat()
-        g_out = cv.gapi.medianBlur(g_in, ksize)
-        c = cv.GComputation(g_in, g_out)
+    class test_gapi_streaming(NewOpenCVTests):
 
-        ccomp = c.compileStreaming()
-        source = cv.gapi.wip.make_capture_src(path)
-        ccomp.setSource(source)
-        ccomp.start()
+        def test_image_input(self):
+            sz = (1280, 720)
+            in_mat = np.random.randint(0, 100, sz).astype(np.uint8)
 
-        # Assert
-        max_num_frames  = 10
-        proc_num_frames = 0
-        while cap.isOpened():
-            has_expected, expected = cap.read()
-            has_actual,   actual   = ccomp.pull()
+            # OpenCV
+            expected = cv.medianBlur(in_mat, 3)
+
+            # G-API
+            g_in = cv.GMat()
+            g_out = cv.gapi.medianBlur(g_in, 3)
+            c = cv.GComputation(g_in, g_out)
+            ccomp = c.compileStreaming(cv.gapi.descr_of(in_mat))
+            ccomp.setSource(cv.gin(in_mat))
+            ccomp.start()
 
-            self.assertEqual(has_expected, has_actual)
+            _, actual = ccomp.pull()
+
+            # Assert
+            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-            if not has_actual:
-                break
 
-            self.assertEqual(0.0, cv.norm(cv.medianBlur(expected, ksize), actual, cv.NORM_INF))
+        def test_video_input(self):
+            ksize = 3
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
 
-            proc_num_frames += 1
-            if proc_num_frames == max_num_frames:
-                break;
+            # OpenCV
+            cap = cv.VideoCapture(path)
 
+            # G-API
+            g_in = cv.GMat()
+            g_out = cv.gapi.medianBlur(g_in, ksize)
+            c = cv.GComputation(g_in, g_out)
 
-    def test_video_split3(self):
-        path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+            ccomp = c.compileStreaming()
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source))
+            ccomp.start()
 
-        # OpenCV
-        cap = cv.VideoCapture(path)
+            # Assert
+            max_num_frames  = 10
+            proc_num_frames = 0
+            while cap.isOpened():
+                has_expected, expected = cap.read()
+                has_actual,   actual   = ccomp.pull()
 
-        # G-API
-        g_in = cv.GMat()
-        b, g, r = cv.gapi.split3(g_in)
-        c = cv.GComputation(cv.GIn(g_in), cv.GOut(b, g, r))
+                self.assertEqual(has_expected, has_actual)
 
-        ccomp = c.compileStreaming()
-        source = cv.gapi.wip.make_capture_src(path)
-        ccomp.setSource(source)
-        ccomp.start()
+                if not has_actual:
+                    break
 
-        # Assert
-        max_num_frames  = 10
-        proc_num_frames = 0
-        while cap.isOpened():
-            has_expected, frame = cap.read()
-            has_actual,   actual   = ccomp.pull()
+                self.assertEqual(0.0, cv.norm(cv.medianBlur(expected, ksize), actual, cv.NORM_INF))
 
-            self.assertEqual(has_expected, has_actual)
+                proc_num_frames += 1
+                if proc_num_frames == max_num_frames:
+                    break
 
-            if not has_actual:
-                break
 
-            expected = cv.split(frame)
-            for e, a in zip(expected, actual):
-                self.assertEqual(0.0, cv.norm(e, a, cv.NORM_INF))
+        def test_video_split3(self):
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
 
-            proc_num_frames += 1
-            if proc_num_frames == max_num_frames:
-                break;
+            # OpenCV
+            cap = cv.VideoCapture(path)
 
+            # G-API
+            g_in = cv.GMat()
+            b, g, r = cv.gapi.split3(g_in)
+            c = cv.GComputation(cv.GIn(g_in), cv.GOut(b, g, r))
 
-    def test_video_add(self):
-        sz = (576, 768, 3)
-        in_mat = np.random.randint(0, 100, sz).astype(np.uint8)
+            ccomp = c.compileStreaming()
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source))
+            ccomp.start()
 
-        path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+            # Assert
+            max_num_frames  = 10
+            proc_num_frames = 0
+            while cap.isOpened():
+                has_expected, frame = cap.read()
+                has_actual,   actual   = ccomp.pull()
 
-        # OpenCV
-        cap = cv.VideoCapture(path)
+                self.assertEqual(has_expected, has_actual)
 
-        # G-API
-        g_in1 = cv.GMat()
-        g_in2 = cv.GMat()
-        out = cv.gapi.add(g_in1, g_in2)
-        c = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(out))
+                if not has_actual:
+                    break
 
-        ccomp = c.compileStreaming()
-        source = cv.gapi.wip.make_capture_src(path)
-        ccomp.setSource(cv.gin(source, in_mat))
-        ccomp.start()
+                expected = cv.split(frame)
+                for e, a in zip(expected, actual):
+                    self.assertEqual(0.0, cv.norm(e, a, cv.NORM_INF))
 
-        # Assert
-        max_num_frames  = 10
-        proc_num_frames = 0
-        while cap.isOpened():
-            has_expected, frame  = cap.read()
-            has_actual,   actual = ccomp.pull()
+                proc_num_frames += 1
+                if proc_num_frames == max_num_frames:
+                    break
 
-            self.assertEqual(has_expected, has_actual)
 
-            if not has_actual:
-                break
+        def test_video_add(self):
+            sz = (576, 768, 3)
+            in_mat = np.random.randint(0, 100, sz).astype(np.uint8)
 
-            expected = cv.add(frame, in_mat)
-            self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
 
-            proc_num_frames += 1
-            if proc_num_frames == max_num_frames:
-                break;
+            # OpenCV
+            cap = cv.VideoCapture(path)
 
+            # G-API
+            g_in1 = cv.GMat()
+            g_in2 = cv.GMat()
+            out = cv.gapi.add(g_in1, g_in2)
+            c = cv.GComputation(cv.GIn(g_in1, g_in2), cv.GOut(out))
 
-    def test_video_good_features_to_track(self):
-        path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+            ccomp = c.compileStreaming()
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source, in_mat))
+            ccomp.start()
 
-        # NB: goodFeaturesToTrack configuration
-        max_corners         = 50
-        quality_lvl         = 0.01
-        min_distance        = 10
-        block_sz            = 3
-        use_harris_detector = True
-        k                   = 0.04
-        mask                = None
+            # Assert
+            max_num_frames  = 10
+            proc_num_frames = 0
+            while cap.isOpened():
+                has_expected, frame  = cap.read()
+                has_actual,   actual = ccomp.pull()
 
-        # OpenCV
-        cap = cv.VideoCapture(path)
+                self.assertEqual(has_expected, has_actual)
 
-        # G-API
-        g_in = cv.GMat()
-        g_gray = cv.gapi.RGB2Gray(g_in)
-        g_out = cv.gapi.goodFeaturesToTrack(g_gray, max_corners, quality_lvl,
-                                            min_distance, mask, block_sz, use_harris_detector, k)
+                if not has_actual:
+                    break
 
-        c = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+                expected = cv.add(frame, in_mat)
+                self.assertEqual(0.0, cv.norm(expected, actual, cv.NORM_INF))
 
-        ccomp = c.compileStreaming()
-        source = cv.gapi.wip.make_capture_src(path)
-        ccomp.setSource(source)
-        ccomp.start()
+                proc_num_frames += 1
+                if proc_num_frames == max_num_frames:
+                    break
 
-        # Assert
-        max_num_frames  = 10
-        proc_num_frames = 0
-        while cap.isOpened():
-            has_expected, frame  = cap.read()
-            has_actual,   actual = ccomp.pull()
 
-            self.assertEqual(has_expected, has_actual)
+        def test_video_good_features_to_track(self):
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
 
-            if not has_actual:
-                break
+            # NB: goodFeaturesToTrack configuration
+            max_corners         = 50
+            quality_lvl         = 0.01
+            min_distance        = 10
+            block_sz            = 3
+            use_harris_detector = True
+            k                   = 0.04
+            mask                = None
 
             # OpenCV
-            frame = cv.cvtColor(frame, cv.COLOR_RGB2GRAY)
-            expected = cv.goodFeaturesToTrack(frame, max_corners, quality_lvl,
-                                              min_distance, mask=mask,
-                                              blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
-            for e, a in zip(expected, actual):
-                # NB: OpenCV & G-API have different output shapes:
-                # OpenCV - (num_points, 1, 2)
-                # G-API  - (num_points, 2)
-                self.assertEqual(0.0, cv.norm(e.flatten(),
-                                              np.array(a, np.float32).flatten(),
-                                              cv.NORM_INF))
-
-            proc_num_frames += 1
-            if proc_num_frames == max_num_frames:
-                break;
+            cap = cv.VideoCapture(path)
+
+            # G-API
+            g_in = cv.GMat()
+            g_gray = cv.gapi.RGB2Gray(g_in)
+            g_out = cv.gapi.goodFeaturesToTrack(g_gray, max_corners, quality_lvl,
+                                                min_distance, mask, block_sz, use_harris_detector, k)
+
+            c = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out))
+
+            ccomp = c.compileStreaming()
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source))
+            ccomp.start()
+
+            # Assert
+            max_num_frames  = 10
+            proc_num_frames = 0
+            while cap.isOpened():
+                has_expected, frame  = cap.read()
+                has_actual,   actual = ccomp.pull()
+
+                self.assertEqual(has_expected, has_actual)
+
+                if not has_actual:
+                    break
+
+                # OpenCV
+                frame = cv.cvtColor(frame, cv.COLOR_RGB2GRAY)
+                expected = cv.goodFeaturesToTrack(frame, max_corners, quality_lvl,
+                                                  min_distance, mask=mask,
+                                                  blockSize=block_sz, useHarrisDetector=use_harris_detector, k=k)
+                for e, a in zip(expected, actual):
+                    # NB: OpenCV & G-API have different output shapes:
+                    # OpenCV - (num_points, 1, 2)
+                    # G-API  - (num_points, 2)
+                    self.assertEqual(0.0, cv.norm(e.flatten(),
+                                                  np.array(a, np.float32).flatten(),
+                                                  cv.NORM_INF))
+
+                proc_num_frames += 1
+                if proc_num_frames == max_num_frames:
+                    break
+
+
+        def test_gapi_streaming_meta(self):
+            ksize = 3
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+
+            # G-API
+            g_in = cv.GMat()
+            g_ts = cv.gapi.streaming.timestamp(g_in)
+            g_seqno = cv.gapi.streaming.seqNo(g_in)
+            g_seqid = cv.gapi.streaming.seq_id(g_in)
+
+            c = cv.GComputation(cv.GIn(g_in), cv.GOut(g_ts, g_seqno, g_seqid))
+
+            ccomp = c.compileStreaming()
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source))
+            ccomp.start()
+
+            # Assert
+            max_num_frames  = 10
+            curr_frame_number = 0
+            while True:
+                has_frame, (ts, seqno, seqid) = ccomp.pull()
+
+                if not has_frame:
+                    break
+
+                self.assertEqual(curr_frame_number, seqno)
+                self.assertEqual(curr_frame_number, seqid)
+
+                curr_frame_number += 1
+                if curr_frame_number == max_num_frames:
+                    break
+
+        def test_desync(self):
+            path = self.find_file('cv/video/768x576.avi', [os.environ['OPENCV_TEST_DATA_PATH']])
+
+            # G-API
+            g_in = cv.GMat()
+            g_out1 = cv.gapi.copy(g_in)
+            des = cv.gapi.streaming.desync(g_in)
+            g_out2 = GDelay.on(des)
+
+            c = cv.GComputation(cv.GIn(g_in), cv.GOut(g_out1, g_out2))
+
+            kernels = cv.gapi.kernels(GDelayImpl)
+            ccomp = c.compileStreaming(args=cv.gapi.compile_args(kernels))
+            source = cv.gapi.wip.make_capture_src(path)
+            ccomp.setSource(cv.gin(source))
+            ccomp.start()
+
+            # Assert
+            max_num_frames  = 10
+            proc_num_frames = 0
+
+            out_counter = 0
+            desync_out_counter = 0
+            none_counter = 0
+            while True:
+                has_frame, (out1, out2) = ccomp.pull()
+                if not has_frame:
+                    break
+
+                if not out1 is None:
+                    out_counter += 1
+                if not out2 is None:
+                    desync_out_counter += 1
+                else:
+                    none_counter += 1
+
+                proc_num_frames += 1
+                if proc_num_frames == max_num_frames:
+                    ccomp.stop()
+                    break
+
+            self.assertLess(0, proc_num_frames)
+            self.assertLess(desync_out_counter, out_counter)
+            self.assertLess(0, none_counter)
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+    pass
+
 
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/gapi/misc/python/test/test_gapi_types.py b/modules/gapi/misc/python/test/test_gapi_types.py
index 0f3b194a2f97..dde554f5e10a 100644
--- a/modules/gapi/misc/python/test/test_gapi_types.py
+++ b/modules/gapi/misc/python/test/test_gapi_types.py
@@ -3,29 +3,51 @@
 import numpy as np
 import cv2 as cv
 import os
+import sys
+import unittest
 
 from tests_common import NewOpenCVTests
 
-class gapi_types_test(NewOpenCVTests):
 
-    def test_garray_type(self):
-        types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT   , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT,
-                 cv.gapi.CV_STRING, cv.gapi.CV_POINT , cv.gapi.CV_POINT2F, cv.gapi.CV_SIZE ,
-                 cv.gapi.CV_RECT  , cv.gapi.CV_SCALAR, cv.gapi.CV_MAT    , cv.gapi.CV_GMAT]
+try:
 
-        for t in types:
-            g_array = cv.GArrayT(t)
-            self.assertEqual(t, g_array.type())
+    if sys.version_info[:2] < (3, 0):
+        raise unittest.SkipTest('Python 2.x is not supported')
 
+    class gapi_types_test(NewOpenCVTests):
 
-    def test_gopaque_type(self):
-        types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT   , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT,
-                 cv.gapi.CV_STRING, cv.gapi.CV_POINT , cv.gapi.CV_POINT2F, cv.gapi.CV_SIZE ,
-                 cv.gapi.CV_RECT]
+        def test_garray_type(self):
+            types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT   , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT,
+                     cv.gapi.CV_STRING, cv.gapi.CV_POINT , cv.gapi.CV_POINT2F, cv.gapi.CV_SIZE ,
+                     cv.gapi.CV_RECT  , cv.gapi.CV_SCALAR, cv.gapi.CV_MAT    , cv.gapi.CV_GMAT]
 
-        for t in types:
-            g_opaque = cv.GOpaqueT(t)
-            self.assertEqual(t, g_opaque.type())
+            for t in types:
+                g_array = cv.GArrayT(t)
+                self.assertEqual(t, g_array.type())
+
+
+        def test_gopaque_type(self):
+            types = [cv.gapi.CV_BOOL  , cv.gapi.CV_INT   , cv.gapi.CV_DOUBLE , cv.gapi.CV_FLOAT,
+                     cv.gapi.CV_STRING, cv.gapi.CV_POINT , cv.gapi.CV_POINT2F, cv.gapi.CV_SIZE ,
+                     cv.gapi.CV_RECT]
+
+            for t in types:
+                g_opaque = cv.GOpaqueT(t)
+                self.assertEqual(t, g_opaque.type())
+
+
+except unittest.SkipTest as e:
+
+    message = str(e)
+
+    class TestSkip(unittest.TestCase):
+        def setUp(self):
+            self.skipTest('Skip tests: ' + message)
+
+        def test_skip():
+            pass
+
+    pass
 
 
 if __name__ == '__main__':
diff --git a/modules/gapi/samples/api_ref_snippets.cpp b/modules/gapi/samples/api_ref_snippets.cpp
index 6c660fb8fa2e..0abcab89b383 100644
--- a/modules/gapi/samples/api_ref_snippets.cpp
+++ b/modules/gapi/samples/api_ref_snippets.cpp
@@ -4,6 +4,10 @@
 #include <opencv2/gapi/core.hpp>
 #include <opencv2/gapi/imgproc.hpp>
 
+#include <opencv2/gapi/s11n.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+
 #include <opencv2/gapi/cpu/gcpukernel.hpp>
 
 #include <opencv2/gapi/fluid/core.hpp>
@@ -55,6 +59,120 @@ static void typed_example()
     //! [Typed_Example]
 }
 
+static void bind_serialization_example()
+{
+    // ! [bind after deserialization]
+    cv::GCompiled compd;
+    std::vector<char> bytes;
+    auto graph = cv::gapi::deserialize<cv::GComputation>(bytes);
+    auto meta = cv::gapi::deserialize<cv::GMetaArgs>(bytes);
+
+    compd = graph.compile(std::move(meta), cv::compile_args());
+    auto in_args  = cv::gapi::deserialize<cv::GRunArgs>(bytes);
+    auto out_args = cv::gapi::deserialize<cv::GRunArgs>(bytes);
+    compd(std::move(in_args), cv::gapi::bind(out_args));
+    // ! [bind after deserialization]
+}
+
+static void bind_deserialization_example()
+{
+    // ! [bind before serialization]
+    std::vector<cv::GRunArgP> graph_outs;
+    cv::GRunArgs out_args;
+
+    for (auto &&out : graph_outs) {
+        out_args.emplace_back(cv::gapi::bind(out));
+    }
+    const auto sargsout = cv::gapi::serialize(out_args);
+    // ! [bind before serialization]
+}
+
+struct SimpleCustomType {
+    bool val;
+    bool operator==(const SimpleCustomType& other) const {
+        return val == other.val;
+    }
+};
+
+struct SimpleCustomType2 {
+    int val;
+    std::string name;
+    std::vector<float> vec;
+    std::map<int, uint64_t> mmap;
+    bool operator==(const SimpleCustomType2& other) const {
+        return val == other.val && name == other.name &&
+               vec == other.vec && mmap == other.mmap;
+    }
+};
+
+// ! [S11N usage]
+namespace cv {
+namespace gapi {
+namespace s11n {
+namespace detail {
+template<> struct S11N<SimpleCustomType> {
+    static void serialize(IOStream &os, const SimpleCustomType &p) {
+        os << p.val;
+    }
+    static SimpleCustomType deserialize(IIStream &is) {
+        SimpleCustomType p;
+        is >> p.val;
+        return p;
+    }
+};
+
+template<> struct S11N<SimpleCustomType2> {
+    static void serialize(IOStream &os, const SimpleCustomType2 &p) {
+        os << p.val << p.name << p.vec << p.mmap;
+    }
+    static SimpleCustomType2 deserialize(IIStream &is) {
+        SimpleCustomType2 p;
+        is >> p.val >> p.name >> p.vec >> p.mmap;
+        return p;
+    }
+};
+} // namespace detail
+} // namespace s11n
+} // namespace gapi
+} // namespace cv
+// ! [S11N usage]
+
+namespace cv {
+namespace detail {
+template<> struct CompileArgTag<SimpleCustomType> {
+    static const char* tag() {
+        return "org.opencv.test.simple_custom_type";
+    }
+};
+
+template<> struct CompileArgTag<SimpleCustomType2> {
+    static const char* tag() {
+        return "org.opencv.test.simple_custom_type_2";
+    }
+};
+} // namespace detail
+} // namespace cv
+
+static void s11n_example()
+{
+    SimpleCustomType  customVar1 { false };
+    SimpleCustomType2 customVar2 { 1248, "World", {1280, 720, 640, 480},
+                                   { {5, 32434142342}, {7, 34242432} } };
+
+    std::vector<char> sArgs = cv::gapi::serialize(
+        cv::compile_args(customVar1, customVar2));
+
+    cv::GCompileArgs dArgs = cv::gapi::deserialize<cv::GCompileArgs,
+                                                   SimpleCustomType,
+                                                   SimpleCustomType2>(sArgs);
+
+    SimpleCustomType  dCustomVar1 = cv::gapi::getCompileArg<SimpleCustomType>(dArgs).value();
+    SimpleCustomType2 dCustomVar2 = cv::gapi::getCompileArg<SimpleCustomType2>(dArgs).value();
+
+    (void) dCustomVar1;
+    (void) dCustomVar2;
+}
+
 G_TYPED_KERNEL(IAdd, <cv::GMat(cv::GMat)>, "test.custom.add") {
     static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
 };
@@ -128,5 +246,8 @@ int main(int argc, char *argv[])
     // unused functions
     typed_example();
     gscalar_example();
+    bind_serialization_example();
+    bind_deserialization_example();
+    s11n_example();
     return 0;
 }
diff --git a/modules/gapi/samples/face_detection_mtcnn.cpp b/modules/gapi/samples/face_detection_mtcnn.cpp
index c437bdbba46c..50cb666a90f5 100644
--- a/modules/gapi/samples/face_detection_mtcnn.cpp
+++ b/modules/gapi/samples/face_detection_mtcnn.cpp
@@ -589,30 +589,23 @@ int main(int argc, char* argv[]) {
     //Preprocessing BGR2RGB + transpose (NCWH is expected instead of NCHW)
     cv::GMat in_original;
     cv::GMat in_originalRGB = cv::gapi::BGR2RGB(in_original);
+    cv::GMat in_transposedRGB = cv::gapi::transpose(in_originalRGB);
     cv::GOpaque<cv::Size> in_sz = cv::gapi::streaming::size(in_original);
-    cv::GMat in_resized[MAX_PYRAMID_LEVELS];
-    cv::GMat in_transposed[MAX_PYRAMID_LEVELS];
     cv::GMat regressions[MAX_PYRAMID_LEVELS];
     cv::GMat scores[MAX_PYRAMID_LEVELS];
     cv::GArray<custom::Face> nms_p_faces[MAX_PYRAMID_LEVELS];
     cv::GArray<custom::Face> total_faces[MAX_PYRAMID_LEVELS];
-    cv::GArray<custom::Face> faces_init(std::vector<custom::Face>{});
 
     //The very first PNet pyramid layer to init total_faces[0]
-    in_resized[0] = cv::gapi::resize(in_originalRGB, level_size[0]);
-    in_transposed[0] = cv::gapi::transpose(in_resized[0]);
-    std::tie(regressions[0], scores[0]) = run_mtcnn_p(in_transposed[0], get_pnet_level_name(level_size[0]));
+    std::tie(regressions[0], scores[0]) = run_mtcnn_p(in_transposedRGB, get_pnet_level_name(level_size[0]));
     cv::GArray<custom::Face> faces0 = custom::BuildFaces::on(scores[0], regressions[0], static_cast<float>(scales[0]), conf_thresh_p);
     cv::GArray<custom::Face> final_p_faces_for_bb2squares = custom::ApplyRegression::on(faces0, true);
     cv::GArray<custom::Face> final_faces_pnet0 = custom::BBoxesToSquares::on(final_p_faces_for_bb2squares);
-    nms_p_faces[0] = custom::RunNMS::on(final_faces_pnet0, 0.5f, false);
-    total_faces[0] = custom::AccumulatePyramidOutputs::on(faces_init, nms_p_faces[0]);
+    total_faces[0] = custom::RunNMS::on(final_faces_pnet0, 0.5f, false);
     //The rest PNet pyramid layers to accumlate all layers result in total_faces[PYRAMID_LEVELS - 1]]
     for (int i = 1; i < pyramid_levels; ++i)
     {
-        in_resized[i] = cv::gapi::resize(in_originalRGB, level_size[i]);
-        in_transposed[i] = cv::gapi::transpose(in_resized[i]);
-        std::tie(regressions[i], scores[i]) = run_mtcnn_p(in_transposed[i], get_pnet_level_name(level_size[i]));
+        std::tie(regressions[i], scores[i]) = run_mtcnn_p(in_transposedRGB, get_pnet_level_name(level_size[i]));
         cv::GArray<custom::Face> faces = custom::BuildFaces::on(scores[i], regressions[i], static_cast<float>(scales[i]), conf_thresh_p);
         cv::GArray<custom::Face> final_p_faces_for_bb2squares_i = custom::ApplyRegression::on(faces, true);
         cv::GArray<custom::Face> final_faces_pnet_i = custom::BBoxesToSquares::on(final_p_faces_for_bb2squares_i);
@@ -626,8 +619,7 @@ int main(int argc, char* argv[]) {
     //Refinement part of MTCNN graph
     cv::GArray<cv::Rect> faces_roi_pnet = custom::R_O_NetPreProcGetROIs::on(final_faces_pnet, in_sz);
     cv::GArray<cv::GMat> regressionsRNet, scoresRNet;
-    cv::GMat in_originalRGB_transposed = cv::gapi::transpose(in_originalRGB);
-    std::tie(regressionsRNet, scoresRNet) = cv::gapi::infer<custom::MTCNNRefinement>(faces_roi_pnet, in_originalRGB_transposed);
+    std::tie(regressionsRNet, scoresRNet) = cv::gapi::infer<custom::MTCNNRefinement>(faces_roi_pnet, in_transposedRGB);
 
     //Refinement post-processing
     cv::GArray<custom::Face> rnet_post_proc_faces = custom::RNetPostProc::on(final_faces_pnet, scoresRNet, regressionsRNet, conf_thresh_r);
@@ -638,7 +630,7 @@ int main(int argc, char* argv[]) {
     //Output part of MTCNN graph
     cv::GArray<cv::Rect> faces_roi_rnet = custom::R_O_NetPreProcGetROIs::on(final_faces_rnet, in_sz);
     cv::GArray<cv::GMat> regressionsONet, scoresONet, landmarksONet;
-    std::tie(regressionsONet, landmarksONet, scoresONet) = cv::gapi::infer<custom::MTCNNOutput>(faces_roi_rnet, in_originalRGB_transposed);
+    std::tie(regressionsONet, landmarksONet, scoresONet) = cv::gapi::infer<custom::MTCNNOutput>(faces_roi_rnet, in_transposedRGB);
 
     //Output post-processing
     cv::GArray<custom::Face> onet_post_proc_faces = custom::ONetPostProc::on(final_faces_rnet, scoresONet, regressionsONet, landmarksONet, conf_thresh_o);
diff --git a/modules/gapi/samples/onevpl_infer_single_roi.cpp b/modules/gapi/samples/onevpl_infer_single_roi.cpp
new file mode 100644
index 000000000000..8a7efafabfd8
--- /dev/null
+++ b/modules/gapi/samples/onevpl_infer_single_roi.cpp
@@ -0,0 +1,254 @@
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <cctype>
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/core.hpp>
+#include <opencv2/gapi/cpu/gcpukernel.hpp>
+#include <opencv2/gapi/infer/ie.hpp>
+#include <opencv2/gapi/render.hpp>
+#include <opencv2/gapi/streaming/onevpl/onevpl_source.hpp>
+#include <opencv2/highgui.hpp> // CommandLineParser
+
+const std::string about =
+    "This is an OpenCV-based version of oneVPLSource decoder example";
+const std::string keys =
+    "{ h help       |                                           | Print this help message }"
+    "{ input        |                                           | Path to the input demultiplexed video file }"
+    "{ output       |                                           | Path to the output RAW video file. Use .avi extension }"
+    "{ facem        | face-detection-adas-0001.xml              | Path to OpenVINO IE face detection model (.xml) }"
+    "{ faced        | CPU                                       | Target device for face detection model (e.g. CPU, GPU, VPU, ...) }";
+
+namespace {
+std::string get_weights_path(const std::string &model_path) {
+    const auto EXT_LEN = 4u;
+    const auto sz = model_path.size();
+    CV_Assert(sz > EXT_LEN);
+
+    auto ext = model_path.substr(sz - EXT_LEN);
+    std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c){
+            return static_cast<unsigned char>(std::tolower(c));
+        });
+    CV_Assert(ext == ".xml");
+    return model_path.substr(0u, sz - EXT_LEN) + ".bin";
+}
+} // anonymous namespace
+
+namespace custom {
+G_API_NET(FaceDetector,   <cv::GMat(cv::GMat)>, "face-detector");
+
+using GDetections = cv::GArray<cv::Rect>;
+using GRect       = cv::GOpaque<cv::Rect>;
+using GSize       = cv::GOpaque<cv::Size>;
+using GPrims      = cv::GArray<cv::gapi::wip::draw::Prim>;
+
+G_API_OP(LocateROI, <GRect(GSize)>, "sample.custom.locate-roi") {
+    static cv::GOpaqueDesc outMeta(const cv::GOpaqueDesc &) {
+        return cv::empty_gopaque_desc();
+    }
+};
+
+G_API_OP(ParseSSD, <GDetections(cv::GMat, GRect, GSize)>, "sample.custom.parse-ssd") {
+    static cv::GArrayDesc outMeta(const cv::GMatDesc &, const cv::GOpaqueDesc &, const cv::GOpaqueDesc &) {
+        return cv::empty_array_desc();
+    }
+};
+
+G_API_OP(BBoxes, <GPrims(GDetections, GRect)>, "sample.custom.b-boxes") {
+    static cv::GArrayDesc outMeta(const cv::GArrayDesc &, const cv::GOpaqueDesc &) {
+        return cv::empty_array_desc();
+    }
+};
+
+GAPI_OCV_KERNEL(OCVLocateROI, LocateROI) {
+    // This is the place where we can run extra analytics
+    // on the input image frame and select the ROI (region
+    // of interest) where we want to detect our objects (or
+    // run any other inference).
+    //
+    // Currently it doesn't do anything intelligent,
+    // but only crops the input image to square (this is
+    // the most convenient aspect ratio for detectors to use)
+
+    static void run(const cv::Size& in_size, cv::Rect &out_rect) {
+
+        // Identify the central point & square size (- some padding)
+        const auto center = cv::Point{in_size.width/2, in_size.height/2};
+        auto sqside = std::min(in_size.width, in_size.height);
+
+        // Now build the central square ROI
+        out_rect = cv::Rect{ center.x - sqside/2
+                           , center.y - sqside/2
+                           , sqside
+                           , sqside
+                           };
+    }
+};
+
+GAPI_OCV_KERNEL(OCVParseSSD, ParseSSD) {
+    static void run(const cv::Mat &in_ssd_result,
+                    const cv::Rect &in_roi,
+                    const cv::Size &in_parent_size,
+                    std::vector<cv::Rect> &out_objects) {
+        const auto &in_ssd_dims = in_ssd_result.size;
+        CV_Assert(in_ssd_dims.dims() == 4u);
+
+        const int MAX_PROPOSALS = in_ssd_dims[2];
+        const int OBJECT_SIZE   = in_ssd_dims[3];
+        CV_Assert(OBJECT_SIZE  == 7); // fixed SSD object size
+
+        const cv::Size up_roi = in_roi.size();
+        const cv::Rect surface({0,0}, in_parent_size);
+
+        out_objects.clear();
+
+        const float *data = in_ssd_result.ptr<float>();
+        for (int i = 0; i < MAX_PROPOSALS; i++) {
+            const float image_id   = data[i * OBJECT_SIZE + 0];
+            const float label      = data[i * OBJECT_SIZE + 1];
+            const float confidence = data[i * OBJECT_SIZE + 2];
+            const float rc_left    = data[i * OBJECT_SIZE + 3];
+            const float rc_top     = data[i * OBJECT_SIZE + 4];
+            const float rc_right   = data[i * OBJECT_SIZE + 5];
+            const float rc_bottom  = data[i * OBJECT_SIZE + 6];
+            (void) label; // unused
+
+            if (image_id < 0.f) {
+                break;    // marks end-of-detections
+            }
+            if (confidence < 0.5f) {
+                continue; // skip objects with low confidence
+            }
+
+            // map relative coordinates to the original image scale
+            // taking the ROI into account
+            cv::Rect rc;
+            rc.x      = static_cast<int>(rc_left   * up_roi.width);
+            rc.y      = static_cast<int>(rc_top    * up_roi.height);
+            rc.width  = static_cast<int>(rc_right  * up_roi.width)  - rc.x;
+            rc.height = static_cast<int>(rc_bottom * up_roi.height) - rc.y;
+            rc.x += in_roi.x;
+            rc.y += in_roi.y;
+            out_objects.emplace_back(rc & surface);
+        }
+    }
+};
+
+GAPI_OCV_KERNEL(OCVBBoxes, BBoxes) {
+    // This kernel converts the rectangles into G-API's
+    // rendering primitives
+    static void run(const std::vector<cv::Rect> &in_face_rcs,
+                    const             cv::Rect  &in_roi,
+                          std::vector<cv::gapi::wip::draw::Prim> &out_prims) {
+        out_prims.clear();
+        const auto cvt = [](const cv::Rect &rc, const cv::Scalar &clr) {
+            return cv::gapi::wip::draw::Rect(rc, clr, 2);
+        };
+        out_prims.emplace_back(cvt(in_roi, CV_RGB(0,255,255))); // cyan
+        for (auto &&rc : in_face_rcs) {
+            out_prims.emplace_back(cvt(rc, CV_RGB(0,255,0)));   // green
+        }
+    }
+};
+
+} // namespace custom
+
+int main(int argc, char *argv[]) {
+
+    cv::CommandLineParser cmd(argc, argv, keys);
+    cmd.about(about);
+    if (cmd.has("help")) {
+        cmd.printMessage();
+        return 0;
+    }
+
+    // get file name
+    std::string file_path = cmd.get<std::string>("input");
+    const std::string output = cmd.get<std::string>("output");
+    const auto face_model_path = cmd.get<std::string>("facem");
+
+    // check ouput file extension
+    if (!output.empty()) {
+        auto ext = output.find_last_of(".");
+        if (ext == std::string::npos || (output.substr(ext + 1) != "avi")) {
+            std::cerr << "Output file should have *.avi extension for output video" << std::endl;
+            return -1;
+        }
+    }
+
+    auto face_net = cv::gapi::ie::Params<custom::FaceDetector> {
+        face_model_path,                 // path to topology IR
+        get_weights_path(face_model_path),   // path to weights
+        cmd.get<std::string>("faced"),   // device specifier
+    };
+    auto kernels = cv::gapi::kernels
+        < custom::OCVLocateROI
+        , custom::OCVParseSSD
+        , custom::OCVBBoxes>();
+    auto networks = cv::gapi::networks(face_net);
+
+    // Create source
+    cv::Ptr<cv::gapi::wip::IStreamSource> cap;
+    try {
+        cap = cv::gapi::wip::make_vpl_src(file_path);
+        std::cout << "oneVPL source desription: " << cap->descr_of() << std::endl;
+    } catch (const std::exception& ex) {
+        std::cerr << "Cannot create source: " << ex.what() << std::endl;
+        return -1;
+    }
+
+    cv::GMetaArg descr = cap->descr_of();
+    auto frame_descr = cv::util::get<cv::GFrameDesc>(descr);
+
+    // Now build the graph
+    cv::GFrame in;
+    auto size = cv::gapi::streaming::size(in);
+    auto roi = custom::LocateROI::on(size);
+    auto blob = cv::gapi::infer<custom::FaceDetector>(roi, in);
+    auto rcs = custom::ParseSSD::on(blob, roi, size);
+    auto out_frame = cv::gapi::wip::draw::renderFrame(in, custom::BBoxes::on(rcs, roi));
+    auto out = cv::gapi::streaming::BGR(out_frame);
+
+    cv::GStreamingCompiled pipeline;
+    try {
+        pipeline  = cv::GComputation(cv::GIn(in), cv::GOut(out))
+                .compileStreaming(cv::compile_args(kernels, networks));
+    } catch (const std::exception& ex) {
+        std::cerr << "Exception occured during pipeline construction: " << ex.what() << std::endl;
+        return -1;
+    }
+    // The execution part
+
+    // TODO USE may set pool size from outside and set queue_capacity size,
+    // compile arg: cv::gapi::streaming::queue_capacity
+    pipeline.setSource(std::move(cap));
+    pipeline.start();
+
+    int framesCount = 0;
+    cv::TickMeter t;
+    cv::VideoWriter writer;
+    if (!output.empty() && !writer.isOpened()) {
+        const auto sz = cv::Size{frame_descr.size.width, frame_descr.size.height};
+        writer.open(output, cv::VideoWriter::fourcc('M','J','P','G'), 25.0, sz);
+        CV_Assert(writer.isOpened());
+    }
+
+    cv::Mat outMat;
+    t.start();
+    while (pipeline.pull(cv::gout(outMat))) {
+        cv::imshow("Out", outMat);
+        cv::waitKey(1);
+        if (!output.empty()) {
+            writer << outMat;
+        }
+        framesCount++;
+    }
+    t.stop();
+    std::cout << "Elapsed time: " << t.getTimeSec() << std::endl;
+    std::cout << "FPS: " << framesCount /  t.getTimeSec() << std::endl;
+    std::cout << "framesCount: " << framesCount << std::endl;
+
+    return 0;
+}
diff --git a/modules/gapi/samples/semantic_segmentation.cpp b/modules/gapi/samples/semantic_segmentation.cpp
index 0a6e7231c4f2..4cdb14cc5c1a 100644
--- a/modules/gapi/samples/semantic_segmentation.cpp
+++ b/modules/gapi/samples/semantic_segmentation.cpp
@@ -47,6 +47,53 @@ std::string get_weights_path(const std::string &model_path) {
     CV_Assert(ext == ".xml");
     return model_path.substr(0u, sz - EXT_LEN) + ".bin";
 }
+
+void classesToColors(const cv::Mat &out_blob,
+                           cv::Mat &mask_img) {
+    const int H = out_blob.size[0];
+    const int W = out_blob.size[1];
+
+    mask_img.create(H, W, CV_8UC3);
+    GAPI_Assert(out_blob.type() == CV_8UC1);
+    const uint8_t* const classes = out_blob.ptr<uint8_t>();
+
+    for (int rowId = 0; rowId < H; ++rowId) {
+        for (int colId = 0; colId < W; ++colId) {
+            uint8_t class_id = classes[rowId * W + colId];
+            mask_img.at<cv::Vec3b>(rowId, colId) =
+                class_id < colors.size()
+                ? colors[class_id]
+                : cv::Vec3b{0, 0, 0}; // NB: sample supports 20 classes
+        }
+    }
+}
+
+void probsToClasses(const cv::Mat& probs, cv::Mat& classes) {
+     const int C = probs.size[1];
+     const int H = probs.size[2];
+     const int W = probs.size[3];
+
+     classes.create(H, W, CV_8UC1);
+     GAPI_Assert(probs.depth() == CV_32F);
+     float* out_p       = reinterpret_cast<float*>(probs.data);
+     uint8_t* classes_p = reinterpret_cast<uint8_t*>(classes.data);
+
+     for (int h = 0; h < H; ++h) {
+         for (int w = 0; w < W; ++w) {
+             double max = 0;
+             int class_id = 0;
+             for (int c = 0; c < C; ++c) {
+                int idx = c * H * W + h * W + w;
+                    if (out_p[idx] > max) {
+                        max = out_p[idx];
+                        class_id = c;
+                    }
+             }
+             classes_p[h * W + w] = static_cast<uint8_t>(class_id);
+         }
+     }
+}
+
 } // anonymous namespace
 
 namespace custom {
@@ -57,25 +104,21 @@ G_API_OP(PostProcessing, <cv::GMat(cv::GMat, cv::GMat)>, "sample.custom.post_pro
 };
 
 GAPI_OCV_KERNEL(OCVPostProcessing, PostProcessing) {
-    static void run(const cv::Mat &in, const cv::Mat &detected_classes, cv::Mat &out) {
-        // This kernel constructs output image by class table and colors vector
-
-        // The semantic-segmentation-adas-0001 output a blob with the shape
-        // [B, C=1, H=1024, W=2048]
-        const int outHeight = 1024;
-        const int outWidth = 2048;
-        cv::Mat maskImg(outHeight, outWidth, CV_8UC3);
-        const int* const classes = detected_classes.ptr<int>();
-        for (int rowId = 0; rowId < outHeight; ++rowId) {
-            for (int colId = 0; colId < outWidth; ++colId) {
-                size_t classId = static_cast<size_t>(classes[rowId * outWidth + colId]);
-                maskImg.at<cv::Vec3b>(rowId, colId) =
-                    classId < colors.size()
-                        ? colors[classId]
-                        : cv::Vec3b{0, 0, 0}; // sample detects 20 classes
-            }
+    static void run(const cv::Mat &in, const cv::Mat &out_blob, cv::Mat &out) {
+        cv::Mat classes;
+        // NB: If output has more than single plane, it contains probabilities
+        // otherwise class id.
+        if (out_blob.size[1] > 1) {
+            probsToClasses(out_blob, classes);
+        } else {
+            out_blob.convertTo(classes, CV_8UC1);
+            classes = classes.reshape(1, out_blob.size[2]);
         }
-        cv::resize(maskImg, out, in.size());
+
+        cv::Mat mask_img;
+        classesToColors(classes, mask_img);
+
+        cv::resize(mask_img, out, in.size());
         const float blending = 0.3f;
         out = in * blending + out * (1 - blending);
     }
@@ -104,8 +147,8 @@ int main(int argc, char *argv[]) {
 
     // Now build the graph
     cv::GMat in;
-    cv::GMat detected_classes = cv::gapi::infer<SemSegmNet>(in);
-    cv::GMat out = custom::PostProcessing::on(in, detected_classes);
+    cv::GMat out_blob = cv::gapi::infer<SemSegmNet>(in);
+    cv::GMat out = custom::PostProcessing::on(in, out_blob);
 
     cv::GStreamingCompiled pipeline = cv::GComputation(cv::GIn(in), cv::GOut(out))
         .compileStreaming(cv::compile_args(kernels, networks));
diff --git a/modules/gapi/src/api/ginfer.cpp b/modules/gapi/src/api/ginfer.cpp
index e3cc94041c32..9db05a43c369 100644
--- a/modules/gapi/src/api/ginfer.cpp
+++ b/modules/gapi/src/api/ginfer.cpp
@@ -15,6 +15,10 @@ cv::gapi::GNetPackage::GNetPackage(std::initializer_list<GNetParam> ii)
     : networks(ii) {
 }
 
+cv::gapi::GNetPackage::GNetPackage(std::vector<GNetParam> nets)
+    : networks(nets) {
+}
+
 std::vector<cv::gapi::GBackend> cv::gapi::GNetPackage::backends() const {
     std::unordered_set<cv::gapi::GBackend> unique_set;
     for (const auto &nn : networks) unique_set.insert(nn.backend);
diff --git a/modules/gapi/src/api/gproto.cpp b/modules/gapi/src/api/gproto.cpp
index 94234c9b4d70..9b012770caee 100644
--- a/modules/gapi/src/api/gproto.cpp
+++ b/modules/gapi/src/api/gproto.cpp
@@ -14,7 +14,6 @@
 
 #include "api/gorigin.hpp"
 #include "api/gproto_priv.hpp"
-#include "logger.hpp"
 
 // FIXME: it should be a visitor!
 // FIXME: Reimplement with traits?
@@ -277,13 +276,9 @@ void cv::validate_input_arg(const GRunArg& arg)
 
 void cv::validate_input_args(const GRunArgs& args)
 {
-    GAPI_LOG_DEBUG(nullptr, "Total count: " << args.size());
-    size_t index = 0;
     for (const auto& arg : args)
     {
-        GAPI_LOG_DEBUG(nullptr, "Process index: " << index);
         validate_input_arg(arg);
-        index ++;
     }
 }
 
diff --git a/modules/gapi/src/api/media.cpp b/modules/gapi/src/api/media.cpp
index 884fc9e83d79..b1c455d40aef 100644
--- a/modules/gapi/src/api/media.cpp
+++ b/modules/gapi/src/api/media.cpp
@@ -35,6 +35,10 @@ cv::MediaFrame::IAdapter* cv::MediaFrame::getAdapter() const {
     return m->adapter.get();
 }
 
+void cv::MediaFrame::serialize(cv::gapi::s11n::IOStream& os) const {
+    return m->adapter->serialize(os);
+}
+
 cv::MediaFrame::View::View(Ptrs&& ptrs, Strides&& strs, Callback &&cb)
     : ptr   (std::move(ptrs))
     , stride(std::move(strs))
diff --git a/modules/gapi/src/api/render_ocv.cpp b/modules/gapi/src/api/render_ocv.cpp
index 5ab2e1dd07c2..f1e9be4b4893 100644
--- a/modules/gapi/src/api/render_ocv.cpp
+++ b/modules/gapi/src/api/render_ocv.cpp
@@ -159,7 +159,7 @@ void drawPrimitivesOCV(cv::Mat& in,
             {
                 const auto& rp = cv::util::get<Rect>(p);
                 const auto color = converter.cvtColor(rp.color);
-                cv::rectangle(in, rp.rect, color , rp.thick);
+                cv::rectangle(in, rp.rect, color, rp.thick, rp.lt, rp.shift);
                 break;
             }
 
@@ -198,7 +198,7 @@ void drawPrimitivesOCV(cv::Mat& in,
             {
                 const auto& cp = cv::util::get<Circle>(p);
                 const auto color = converter.cvtColor(cp.color);
-                cv::circle(in, cp.center, cp.radius, color, cp.thick);
+                cv::circle(in, cp.center, cp.radius, color, cp.thick, cp.lt, cp.shift);
                 break;
             }
 
@@ -206,7 +206,7 @@ void drawPrimitivesOCV(cv::Mat& in,
             {
                 const auto& lp = cv::util::get<Line>(p);
                 const auto color = converter.cvtColor(lp.color);
-                cv::line(in, lp.pt1, lp.pt2, color, lp.thick);
+                cv::line(in, lp.pt1, lp.pt2, color, lp.thick, lp.lt, lp.shift);
                 break;
             }
 
diff --git a/modules/gapi/src/api/s11n.cpp b/modules/gapi/src/api/s11n.cpp
index d08f47fd26a7..bd7f46c88aec 100644
--- a/modules/gapi/src/api/s11n.cpp
+++ b/modules/gapi/src/api/s11n.cpp
@@ -65,25 +65,25 @@ std::vector<char> cv::gapi::serialize(const std::vector<std::string>& vs)
 
 // FIXME: This function should move from S11N to GRunArg-related entities.
 // it has nothing to do with the S11N as it is
-cv::GRunArgsP cv::gapi::bind(cv::GRunArgs &results)
+cv::GRunArgsP cv::gapi::bind(cv::GRunArgs &out_args)
 {
     cv::GRunArgsP outputs;
-    outputs.reserve(results.size());
-    for (cv::GRunArg &res_obj : results)
+    outputs.reserve(out_args.size());
+    for (cv::GRunArg &res_obj : out_args)
     {
         using T = cv::GRunArg;
         switch (res_obj.index())
         {
 #if !defined(GAPI_STANDALONE)
         case T::index_of<cv::UMat>() :
-            outputs.emplace_back((cv::UMat*)(&(cv::util::get<cv::UMat>(res_obj))));
+            outputs.emplace_back(&(cv::util::get<cv::UMat>(res_obj)));
             break;
 #endif
         case cv::GRunArg::index_of<cv::Mat>() :
-            outputs.emplace_back((cv::Mat*)(&(cv::util::get<cv::Mat>(res_obj))));
+            outputs.emplace_back(&(cv::util::get<cv::Mat>(res_obj)));
             break;
         case cv::GRunArg::index_of<cv::Scalar>() :
-            outputs.emplace_back((cv::Scalar*)(&(cv::util::get<cv::Scalar>(res_obj))));
+            outputs.emplace_back(&(cv::util::get<cv::Scalar>(res_obj)));
             break;
         case T::index_of<cv::detail::VectorRef>() :
             outputs.emplace_back(cv::util::get<cv::detail::VectorRef>(res_obj));
@@ -92,7 +92,10 @@ cv::GRunArgsP cv::gapi::bind(cv::GRunArgs &results)
             outputs.emplace_back(cv::util::get<cv::detail::OpaqueRef>(res_obj));
             break;
         case cv::GRunArg::index_of<cv::RMat>() :
-            outputs.emplace_back((cv::RMat*)(&(cv::util::get<cv::RMat>(res_obj))));
+            outputs.emplace_back(&(cv::util::get<cv::RMat>(res_obj)));
+            break;
+        case cv::GRunArg::index_of<cv::MediaFrame>() :
+            outputs.emplace_back(&(cv::util::get<cv::MediaFrame>(res_obj)));
             break;
         default:
             GAPI_Assert(false && "This value type is not supported!"); // ...maybe because of STANDALONE mode.
@@ -130,6 +133,9 @@ cv::GRunArg cv::gapi::bind(cv::GRunArgP &out)
     case T::index_of<cv::RMat*>() :
         return cv::GRunArg(*cv::util::get<cv::RMat*>(out));
 
+    case T::index_of<cv::MediaFrame*>() :
+        return cv::GRunArg(*cv::util::get<cv::MediaFrame*>(out));
+
     default:
         // ...maybe our types were extended
         GAPI_Assert(false && "This value type is UNKNOWN!");
diff --git a/modules/gapi/src/backends/common/gmetabackend.cpp b/modules/gapi/src/backends/common/gmetabackend.cpp
index c535569b0cef..40e87c3ea0aa 100644
--- a/modules/gapi/src/backends/common/gmetabackend.cpp
+++ b/modules/gapi/src/backends/common/gmetabackend.cpp
@@ -85,6 +85,19 @@ class GGraphMetaBackendImpl final: public cv::gapi::GBackend::Priv {
                          const std::vector<cv::gimpl::Data>&) const override {
         return EPtr{new GraphMetaExecutable(graph, nodes)};
     }
+
+    virtual bool controlsMerge() const override
+    {
+        return true;
+    }
+
+    virtual bool allowsMerge(const cv::gimpl::GIslandModel::Graph &,
+                             const ade::NodeHandle &,
+                             const ade::NodeHandle &,
+                             const ade::NodeHandle &) const override
+    {
+        return false;
+    }
 };
 
 cv::gapi::GBackend graph_meta_backend() {
diff --git a/modules/gapi/src/backends/common/serialization.cpp b/modules/gapi/src/backends/common/serialization.cpp
index 7389bacb02f0..619b2feb7417 100644
--- a/modules/gapi/src/backends/common/serialization.cpp
+++ b/modules/gapi/src/backends/common/serialization.cpp
@@ -32,6 +32,14 @@ void putData(GSerialized& s, const cv::gimpl::GModel::ConstGraph& cg, const ade:
         });
     if (s.m_datas.end() == it) {
         s.m_datas.push_back(gdata);
+
+        if (cg.metadata(nh).contains<gimpl::ConstValue>()) {
+            size_t datas_num = s.m_datas.size() - 1;
+            GAPI_DbgAssert(datas_num <= static_cast<size_t>(std::numeric_limits<GSerialized::data_tag_t>::max()));
+            GSerialized::data_tag_t tag = static_cast<GSerialized::data_tag_t>(datas_num);
+            s.m_const_datas.emplace(tag,
+                                    cg.metadata(nh).get<gimpl::ConstValue>());
+        }
     }
 }
 
@@ -42,11 +50,20 @@ void putOp(GSerialized& s, const cv::gimpl::GModel::ConstGraph& cg, const ade::N
     s.m_ops.push_back(op);
 }
 
-void mkDataNode(ade::Graph& g, const cv::gimpl::Data& data) {
+ade::NodeHandle mkDataNode(ade::Graph& g, const cv::gimpl::Data& data) {
     cv::gimpl::GModel::Graph gm(g);
     auto nh = gm.createNode();
     gm.metadata(nh).set(cv::gimpl::NodeType{cv::gimpl::NodeType::DATA});
     gm.metadata(nh).set(data);
+    return nh;
+}
+
+ade::NodeHandle mkConstDataNode(ade::Graph& g, const cv::gimpl::Data& data, const cv::gimpl::ConstValue& const_data) {
+    auto nh = mkDataNode(g, data);
+
+    cv::gimpl::GModel::Graph gm(g);
+    gm.metadata(nh).set(const_data);
+    return nh;
 }
 
 void mkOpNode(ade::Graph& g, const cv::gimpl::Op& op) {
@@ -184,18 +201,20 @@ IOStream& operator<< (IOStream& os, const cv::RMat& mat) {
     return os;
 }
 IIStream& operator>> (IIStream& is, cv::RMat&) {
-    util::throw_error(std::logic_error("operator>> for RMat should never be called"));
+    util::throw_error(std::logic_error("operator>> for RMat should never be called. "
+                                        "Instead, cv::gapi::deserialize<cv::GRunArgs, AdapterTypes...>() "
+                                        "should be used"));
     return is;
 }
 
-IOStream& operator<< (IOStream& os, const cv::MediaFrame &) {
-    // Stub
-    GAPI_Assert(false && "cv::MediaFrame serialization is not supported!");
+IOStream& operator<< (IOStream& os, const cv::MediaFrame &frame) {
+    frame.serialize(os);
     return os;
 }
 IIStream& operator>> (IIStream& is, cv::MediaFrame &) {
-    // Stub
-    GAPI_Assert(false && "cv::MediaFrame serialization is not supported!");
+    util::throw_error(std::logic_error("operator>> for MediaFrame should never be called. "
+                                        "Instead, cv::gapi::deserialize<cv::GRunArgs, AdapterTypes...>() "
+                                        "should be used"));
     return is;
 }
 
@@ -624,6 +643,10 @@ IOStream& operator<< (IOStream& os, const cv::gimpl::Data &d) {
     return os << d.shape << d.rc << d.meta << d.storage << d.kind;
 }
 
+IOStream& operator<< (IOStream& os, const cv::gimpl::ConstValue &cd) {
+    return os << cd.arg;
+}
+
 namespace
 {
 template<typename Ref, typename T, typename... Ts>
@@ -667,6 +690,9 @@ IIStream& operator>> (IIStream& is, cv::gimpl::Data &d) {
     return is;
 }
 
+IIStream& operator>> (IIStream& is, cv::gimpl::ConstValue &cd) {
+    return is >> cd.arg;
+}
 
 IOStream& operator<< (IOStream& os, const cv::gimpl::DataObjectCounter &c) {
     return os << c.m_next_data_id;
@@ -709,18 +735,34 @@ void serialize( IOStream& os
     }
     s.m_counter = cg.metadata().get<cv::gimpl::DataObjectCounter>();
     s.m_proto   = p;
-    os << s.m_ops << s.m_datas << s.m_counter << s.m_proto;
+    os << s.m_ops << s.m_datas << s.m_counter << s.m_proto << s.m_const_datas;
 }
 
 GSerialized deserialize(IIStream &is) {
     GSerialized s;
-    is >> s.m_ops >> s.m_datas >> s.m_counter >> s.m_proto;
+    is >> s.m_ops >> s.m_datas >> s.m_counter >> s.m_proto >> s.m_const_datas;
     return s;
 }
 
 void reconstruct(const GSerialized &s, ade::Graph &g) {
     GAPI_Assert(g.nodes().empty());
-    for (const auto& d  : s.m_datas) cv::gapi::s11n::mkDataNode(g, d);
+
+    GSerialized::data_tag_t tag = 0;
+    for (const auto& d  : s.m_datas) {
+        if (d.storage == gimpl::Data::Storage::CONST_VAL) {
+            auto cit = s.m_const_datas.find(tag);
+            if (cit == s.m_const_datas.end()) {
+                util::throw_error(std::logic_error("Cannot reconstruct graph: Data::Storage::CONST_VAL by tag: " +
+                                  std::to_string(tag) + " requires ConstValue"));
+            }
+
+            mkConstDataNode(g, d, cit->second);
+        } else {
+            cv::gapi::s11n::mkDataNode(g, d);
+        }
+
+        tag ++;
+    }
     for (const auto& op : s.m_ops)   cv::gapi::s11n::mkOpNode(g, op);
     cv::gapi::s11n::linkNodes(g);
 
diff --git a/modules/gapi/src/backends/common/serialization.hpp b/modules/gapi/src/backends/common/serialization.hpp
index b4204ca64e38..529fdc635d5e 100644
--- a/modules/gapi/src/backends/common/serialization.hpp
+++ b/modules/gapi/src/backends/common/serialization.hpp
@@ -31,6 +31,9 @@ struct GSerialized {
     std::vector<cv::gimpl::Data> m_datas;
     cv::gimpl::DataObjectCounter m_counter;
     cv::gimpl::Protocol m_proto;
+
+    using data_tag_t = uint64_t;
+    std::map<data_tag_t, cv::gimpl::ConstValue> m_const_datas;
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -97,6 +100,9 @@ GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::gimpl::Op &op);
 GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gimpl::Data &op);
 GAPI_EXPORTS IIStream& operator>> (IIStream& is,       cv::gimpl::Data &op);
 
+GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gimpl::ConstValue &cd);
+GAPI_EXPORTS IIStream& operator>> (IIStream& os, cv::gimpl::ConstValue &cd);
+
 // Render types ////////////////////////////////////////////////////////////////
 
 GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::gapi::wip::draw::Text &t);
diff --git a/modules/gapi/src/backends/ie/bindings_ie.cpp b/modules/gapi/src/backends/ie/bindings_ie.cpp
index 35191d7bcb53..5874fe137864 100644
--- a/modules/gapi/src/backends/ie/bindings_ie.cpp
+++ b/modules/gapi/src/backends/ie/bindings_ie.cpp
@@ -37,3 +37,15 @@ cv::gapi::ie::PyParams cv::gapi::ie::params(const std::string &tag,
                                             const std::string &device) {
     return {tag, model, device};
 }
+
+cv::gapi::ie::PyParams& cv::gapi::ie::PyParams::constInput(const std::string &layer_name,
+                                                           const cv::Mat &data,
+                                                           TraitAs hint) {
+    m_priv->constInput(layer_name, data, hint);
+    return *this;
+}
+
+cv::gapi::ie::PyParams& cv::gapi::ie::PyParams::cfgNumRequests(size_t nireq) {
+    m_priv->cfgNumRequests(nireq);
+    return *this;
+}
diff --git a/modules/gapi/src/backends/ie/giebackend.cpp b/modules/gapi/src/backends/ie/giebackend.cpp
index 46b6bdbb97ab..007f0db7afcc 100644
--- a/modules/gapi/src/backends/ie/giebackend.cpp
+++ b/modules/gapi/src/backends/ie/giebackend.cpp
@@ -108,6 +108,7 @@ inline IE::Precision toIE(int depth) {
     case CV_8U:  return IE::Precision::U8;
     case CV_32S: return IE::Precision::I32;
     case CV_32F: return IE::Precision::FP32;
+    case CV_16F: return IE::Precision::FP16;
     default:     GAPI_Assert(false && "IE. Unsupported data type");
     }
     return IE::Precision::UNSPECIFIED;
@@ -161,6 +162,7 @@ inline IE::Blob::Ptr wrapIE(const cv::Mat &mat, cv::gapi::ie::TraitAs hint) {
         HANDLE(8U, uint8_t);
         HANDLE(32F, float);
         HANDLE(32S, int);
+        HANDLE(16F, int16_t);
 #undef HANDLE
     default: GAPI_Assert(false && "IE. Unsupported data type");
     }
@@ -222,8 +224,17 @@ struct IEUnit {
     IE::ExecutableNetwork this_network;
     cv::gimpl::ie::wrap::Plugin this_plugin;
 
+    InferenceEngine::RemoteContext::Ptr rctx = nullptr;
+
     explicit IEUnit(const cv::gapi::ie::detail::ParamDesc &pp)
         : params(pp) {
+        InferenceEngine::ParamMap* ctx_params =
+                            cv::util::any_cast<InferenceEngine::ParamMap>(&params.context_config);
+        if (ctx_params != nullptr) {
+            auto ie_core = cv::gimpl::ie::wrap::getCore();
+            rctx = ie_core.CreateContext(params.device_id, *ctx_params);
+        }
+
         if (params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
             net = cv::gimpl::ie::wrap::readNetwork(params);
             inputs  = net.getInputsInfo();
@@ -231,11 +242,7 @@ struct IEUnit {
         } else if (params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Import) {
             this_plugin = cv::gimpl::ie::wrap::getPlugin(params);
             this_plugin.SetConfig(params.config);
-            this_network = cv::gimpl::ie::wrap::importNetwork(this_plugin, params);
-            // FIXME: ICNNetwork returns InputsDataMap/OutputsDataMap,
-            // but ExecutableNetwork returns ConstInputsDataMap/ConstOutputsDataMap
-            inputs  = cv::gimpl::ie::wrap::toInputsDataMap(this_network.GetInputsInfo());
-            outputs = cv::gimpl::ie::wrap::toOutputsDataMap(this_network.GetOutputsInfo());
+            this_network = cv::gimpl::ie::wrap::importNetwork(this_plugin, params, rctx);
             if (!params.reshape_table.empty() || !params.layer_names_to_reshape.empty()) {
                 GAPI_LOG_WARNING(NULL, "Reshape isn't supported for imported network");
             }
@@ -259,10 +266,18 @@ struct IEUnit {
                                                    + params.model_path));
         }
         if (params.num_in == 1u && params.input_names.empty()) {
-            params.input_names = { inputs.begin()->first };
+            if (params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+                params.input_names = { inputs.begin()->first };
+            } else {
+                params.input_names = { this_network.GetInputsInfo().begin()->first };
+            }
         }
         if (params.num_out == 1u && params.output_names.empty()) {
-            params.output_names = { outputs.begin()->first };
+            if (params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+                params.output_names = { outputs.begin()->first };
+            } else {
+                params.output_names = { this_network.GetOutputsInfo().begin()->first };
+            }
         }
         if (!params.reshape_table.empty()) {
             GAPI_Assert((params.reshape_table.size() + params.layer_names_to_reshape.size()) <=
@@ -279,7 +294,8 @@ struct IEUnit {
             // for loadNetwork they can be obtained by using readNetwork
             non_const_this->this_plugin  = cv::gimpl::ie::wrap::getPlugin(params);
             non_const_this->this_plugin.SetConfig(params.config);
-            non_const_this->this_network = cv::gimpl::ie::wrap::loadNetwork(non_const_this->this_plugin, net, params);
+            non_const_this->this_network = cv::gimpl::ie::wrap::loadNetwork(non_const_this->this_plugin,
+                                                                            net, params, rctx);
         }
 
         return {params, this_plugin, this_network};
@@ -481,7 +497,32 @@ using GConstGIEModel = ade::ConstTypedGraph
     , IECallable
     >;
 
+inline IE::Blob::Ptr extractRemoteBlob(IECallContext& ctx, std::size_t i) {
+    GAPI_Assert(ctx.inShape(i) == cv::GShape::GFRAME &&
+                "Remote blob is supported for MediaFrame only");
+
+    cv::util::any any_blob_params = ctx.inFrame(i).blobParams();
+    auto ie_core = cv::gimpl::ie::wrap::getCore();
+
+    using ParamType = std::pair<InferenceEngine::TensorDesc,
+                                InferenceEngine::ParamMap>;
+
+    ParamType* blob_params = cv::util::any_cast<ParamType>(&any_blob_params);
+    if (blob_params == nullptr) {
+        GAPI_Assert(false && "Incorrect type of blobParams: "
+                              "expected std::pair<InferenceEngine::TensorDesc,"
+                                                 "InferenceEngine::ParamMap>");
+    }
+
+    return ctx.uu.rctx->CreateBlob(blob_params->first,
+                                   blob_params->second);
+}
+
 inline IE::Blob::Ptr extractBlob(IECallContext& ctx, std::size_t i) {
+    if (ctx.uu.rctx != nullptr) {
+        return extractRemoteBlob(ctx, i);
+    }
+
     switch (ctx.inShape(i)) {
         case cv::GShape::GFRAME: {
             const auto& frame = ctx.inFrame(i);
@@ -496,6 +537,24 @@ inline IE::Blob::Ptr extractBlob(IECallContext& ctx, std::size_t i) {
     }
     GAPI_Assert(false);
 }
+
+
+static void setBlob(InferenceEngine::InferRequest&        req,
+                    cv::gapi::ie::detail::ParamDesc::Kind kind,
+                    const std::string&                    layer_name,
+                    IE::Blob::Ptr                         blob) {
+    // NB: In case importNetwork preprocessing must be
+    // passed as SetBlob argument.
+    if (kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+        req.SetBlob(layer_name, blob);
+    } else {
+        GAPI_Assert(kind == cv::gapi::ie::detail::ParamDesc::Kind::Import);
+        IE::PreProcessInfo info;
+        info.setResizeAlgorithm(IE::RESIZE_BILINEAR);
+        req.SetBlob(layer_name, blob, info);
+    }
+}
+
 } // anonymous namespace
 
 std::vector<InferenceEngine::InferRequest> cv::gimpl::ie::IECompiled::createInferRequests() {
@@ -854,25 +913,30 @@ struct Infer: public cv::detail::KernelTag {
         // meta order.
         GAPI_Assert(uu.params.input_names.size() == in_metas.size()
                     && "Known input layers count doesn't match input meta count");
-        for (auto &&it : ade::util::zip(ade::util::toRange(uu.params.input_names),
-                                        ade::util::toRange(in_metas))) {
-            const auto &input_name = std::get<0>(it);
-            auto       &&ii = uu.inputs.at(input_name);
-            const auto & mm = std::get<1>(it);
 
-            configureInputInfo(ii, mm);
-            if (uu.params.layer_names_to_reshape.find(input_name) !=
-                uu.params.layer_names_to_reshape.end()) {
-                configureInputReshapeByImage(ii, mm, input_reshape_table);
+        // NB: Configuring input precision and network reshape must be done
+        // only in the loadNetwork case.
+        if (uu.params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+            for (auto &&it : ade::util::zip(ade::util::toRange(uu.params.input_names),
+                                            ade::util::toRange(in_metas))) {
+                    const auto &input_name = std::get<0>(it);
+                    auto       &&ii = uu.inputs.at(input_name);
+                    const auto & mm = std::get<1>(it);
+
+                    configureInputInfo(ii, mm);
+                    if (uu.params.layer_names_to_reshape.find(input_name) !=
+                        uu.params.layer_names_to_reshape.end()) {
+                        configureInputReshapeByImage(ii, mm, input_reshape_table);
+                    }
+                    ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
             }
-            ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
-        }
 
-        // FIXME: This isn't the best place to call reshape function.
-        // Сorrect solution would be to do this in compile() method of network,
-        // but now input meta isn't passed to compile() method.
-        if (!input_reshape_table.empty()) {
-            const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            // FIXME: This isn't the best place to call reshape function.
+            // Сorrect solution would be to do this in compile() method of network,
+            // but now input meta isn't passed to compile() method.
+            if (!input_reshape_table.empty()) {
+                const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            }
         }
 
         // FIXME: It would be nice here to have an exact number of network's
@@ -904,7 +968,10 @@ struct Infer: public cv::detail::KernelTag {
                             // and redirect our data producers to this memory
                             // (A memory dialog comes to the picture again)
                             IE::Blob::Ptr this_blob = extractBlob(*ctx, i);
-                            req.SetBlob(ctx->uu.params.input_names[i], this_blob);
+                            setBlob(req,
+                                    ctx->uu.params.kind,
+                                    ctx->uu.params.input_names[i],
+                                    this_blob);
                         }
                         // FIXME: Should it be done by kernel ?
                         // What about to do that in RequestPool ?
@@ -936,22 +1003,26 @@ struct InferROI: public cv::detail::KernelTag {
         GAPI_Assert(1u == uu.params.input_names.size());
         GAPI_Assert(2u == in_metas.size());
 
-        // 0th is ROI, 1st is input image
-        const auto &input_name = uu.params.input_names.at(0);
-        auto &&ii = uu.inputs.at(input_name);
-        auto &&mm = in_metas.at(1u);
-        configureInputInfo(ii, mm);
-        if (uu.params.layer_names_to_reshape.find(input_name) !=
-            uu.params.layer_names_to_reshape.end()) {
-            configureInputReshapeByImage(ii, mm, input_reshape_table);
-        }
-        ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
+        // NB: Configuring input precision and network reshape must be done
+        // only in the loadNetwork case.
+        if (uu.params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+            // 0th is ROI, 1st is input image
+            const auto &input_name = uu.params.input_names.at(0);
+            auto &&ii = uu.inputs.at(input_name);
+            auto &&mm = in_metas.at(1u);
+            configureInputInfo(ii, mm);
+            if (uu.params.layer_names_to_reshape.find(input_name) !=
+                uu.params.layer_names_to_reshape.end()) {
+                configureInputReshapeByImage(ii, mm, input_reshape_table);
+            }
+            ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
 
-        // FIXME: This isn't the best place to call reshape function.
-        // Сorrect solution would be to do this in compile() method of network,
-        // but now input meta isn't passed to compile() method.
-        if (!input_reshape_table.empty()) {
-            const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            // FIXME: This isn't the best place to call reshape function.
+            // Сorrect solution would be to do this in compile() method of network,
+            // but now input meta isn't passed to compile() method.
+            if (!input_reshape_table.empty()) {
+                const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            }
         }
 
         // FIXME: It would be nice here to have an exact number of network's
@@ -980,10 +1051,11 @@ struct InferROI: public cv::detail::KernelTag {
                         auto&& this_roi = ctx->inArg<cv::detail::OpaqueRef>(0).rref<cv::Rect>();
 
                         IE::Blob::Ptr this_blob = extractBlob(*ctx, 1);
-
-                        req.SetBlob(*(ctx->uu.params.input_names.begin()),
-                                IE::make_shared_blob(this_blob, toIE(this_roi)));
-
+                        setBlob(req,
+                                ctx->uu.params.kind,
+                                *(ctx->uu.params.input_names.begin()),
+                                IE::make_shared_blob(this_blob,
+                                                     toIE(this_roi)));
                         // FIXME: Should it be done by kernel ?
                         // What about to do that in RequestPool ?
                         req.StartAsync();
@@ -1018,23 +1090,27 @@ struct InferList: public cv::detail::KernelTag {
         GAPI_Assert(uu.params.input_names.size() == (in_metas.size() - 1u)
                     && "Known input layers count doesn't match input meta count");
 
-        std::size_t idx = 1u;
-        for (auto &&input_name : uu.params.input_names) {
-            auto       &&ii = uu.inputs.at(input_name);
-            const auto & mm = in_metas[idx++];
-            configureInputInfo(ii, mm);
-            if (uu.params.layer_names_to_reshape.find(input_name) !=
-                uu.params.layer_names_to_reshape.end()) {
-                configureInputReshapeByImage(ii, mm, input_reshape_table);
+        // NB: Configuring input precision and network reshape must be done
+        // only in the loadNetwork case.
+        if (uu.params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+            std::size_t idx = 1u;
+            for (auto &&input_name : uu.params.input_names) {
+                auto       &&ii = uu.inputs.at(input_name);
+                const auto & mm = in_metas[idx++];
+                configureInputInfo(ii, mm);
+                if (uu.params.layer_names_to_reshape.find(input_name) !=
+                    uu.params.layer_names_to_reshape.end()) {
+                    configureInputReshapeByImage(ii, mm, input_reshape_table);
+                }
+                ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
             }
-            ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
-        }
 
-        // FIXME: This isn't the best place to call reshape function.
-        // Сorrect solution would be to do this in compile() method of network,
-        // but now input meta isn't passed to compile() method.
-        if (!input_reshape_table.empty()) {
-            const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            // FIXME: This isn't the best place to call reshape function.
+            // Сorrect solution would be to do this in compile() method of network,
+            // but now input meta isn't passed to compile() method.
+            if (!input_reshape_table.empty()) {
+                const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+            }
         }
 
         // roi-list version is much easier at the moment.
@@ -1060,6 +1136,7 @@ struct InferList: public cv::detail::KernelTag {
         }
 
         IE::Blob::Ptr this_blob = extractBlob(*ctx, 1);
+
         std::vector<std::vector<int>> cached_dims(ctx->uu.params.num_out);
         for (auto i : ade::util::iota(ctx->uu.params.num_out)) {
             const IE::DataPtr& ie_out = ctx->uu.outputs.at(ctx->uu.params.output_names[i]);
@@ -1079,7 +1156,10 @@ struct InferList: public cv::detail::KernelTag {
                 cv::gimpl::ie::RequestPool::Task {
                     [ctx, rc, this_blob](InferenceEngine::InferRequest &req) {
                         IE::Blob::Ptr roi_blob = IE::make_shared_blob(this_blob, toIE(rc));
-                        req.SetBlob(ctx->uu.params.input_names[0u], roi_blob);
+                        setBlob(req,
+                                ctx->uu.params.kind,
+                                ctx->uu.params.input_names[0u],
+                                roi_blob);
                         req.StartAsync();
                     },
                     std::bind(callback, std::placeholders::_1, pos)
@@ -1153,19 +1233,23 @@ struct InferList2: public cv::detail::KernelTag {
                         && "Non-array inputs are not supported");
 
             if (op.k.inKinds[idx] == cv::detail::OpaqueKind::CV_RECT) {
-                // This is a cv::Rect -- configure the IE preprocessing
-                configureInputInfo(ii, mm_0);
-                if (uu.params.layer_names_to_reshape.find(input_name) !=
-                    uu.params.layer_names_to_reshape.end()) {
-                    configureInputReshapeByImage(ii, mm_0, input_reshape_table);
-                }
-                ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
-
-                // FIXME: This isn't the best place to call reshape function.
-                // Сorrect solution would be to do this in compile() method of network,
-                // but now input meta isn't passed to compile() method.
-                if (!input_reshape_table.empty()) {
-                    const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+                // NB: Configuring input precision and network reshape must be done
+                // only in the loadNetwork case.
+                if (uu.params.kind == cv::gapi::ie::detail::ParamDesc::Kind::Load) {
+                    // This is a cv::Rect -- configure the IE preprocessing
+                    configureInputInfo(ii, mm_0);
+                    if (uu.params.layer_names_to_reshape.find(input_name) !=
+                        uu.params.layer_names_to_reshape.end()) {
+                        configureInputReshapeByImage(ii, mm_0, input_reshape_table);
+                    }
+                    ii->getPreProcess().setResizeAlgorithm(IE::RESIZE_BILINEAR);
+
+                    // FIXME: This isn't the best place to call reshape function.
+                    // Сorrect solution would be to do this in compile() method of network,
+                    // but now input meta isn't passed to compile() method.
+                    if (!input_reshape_table.empty()) {
+                        const_cast<IE::CNNNetwork *>(&uu.net)->reshape(input_reshape_table);
+                    }
                 }
             } else {
                 // This is a cv::GMat (equals to: cv::Mat)
@@ -1230,8 +1314,10 @@ struct InferList2: public cv::detail::KernelTag {
                                 GAPI_Assert(false &&
                                         "Only Rect and Mat types are supported for infer list 2!");
                             }
-
-                            req.SetBlob(ctx->uu.params.input_names[in_idx], this_blob);
+                            setBlob(req,
+                                    ctx->uu.params.kind,
+                                    ctx->uu.params.input_names[in_idx],
+                                    this_blob);
                         }
                         req.StartAsync();
                     },
diff --git a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
index ba0632d4f0f2..1f9721dbf4ef 100644
--- a/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
+++ b/modules/gapi/src/backends/ie/giebackend/giewrapper.cpp
@@ -22,24 +22,6 @@ namespace IE = InferenceEngine;
 namespace giewrap = cv::gimpl::ie::wrap;
 using GIEParam = cv::gapi::ie::detail::ParamDesc;
 
-IE::InputsDataMap giewrap::toInputsDataMap (const IE::ConstInputsDataMap& inputs) {
-    IE::InputsDataMap transformed;
-    auto convert = [](const std::pair<std::string, IE::InputInfo::CPtr>& p) {
-        return std::make_pair(p.first, std::const_pointer_cast<IE::InputInfo>(p.second));
-    };
-    std::transform(inputs.begin(), inputs.end(), std::inserter(transformed, transformed.end()), convert);
-    return transformed;
-}
-
-IE::OutputsDataMap giewrap::toOutputsDataMap (const IE::ConstOutputsDataMap& outputs) {
-    IE::OutputsDataMap transformed;
-    auto convert = [](const std::pair<std::string, IE::CDataPtr>& p) {
-        return std::make_pair(p.first, std::const_pointer_cast<IE::Data>(p.second));
-    };
-    std::transform(outputs.begin(), outputs.end(), std::inserter(transformed, transformed.end()), convert);
-    return transformed;
-}
-
 #if INF_ENGINE_RELEASE < 2020000000  // < 2020.1
 // Load extensions (taken from DNN module)
 std::vector<std::string> giewrap::getExtensions(const GIEParam& params) {
@@ -124,7 +106,11 @@ IE::Core giewrap::getPlugin(const GIEParam& params) {
         {
             try
             {
+#if INF_ENGINE_RELEASE >= 2021040000
+                plugin.AddExtension(std::make_shared<IE::Extension>(extlib), params.device_id);
+#else
                 plugin.AddExtension(IE::make_so_pointer<IE::IExtension>(extlib), params.device_id);
+#endif
                 CV_LOG_INFO(NULL, "DNN-IE: Loaded extension plugin: " << extlib);
                 break;
             }
diff --git a/modules/gapi/src/backends/ie/giebackend/giewrapper.hpp b/modules/gapi/src/backends/ie/giebackend/giewrapper.hpp
index 3927c802b713..2e4bac12704a 100644
--- a/modules/gapi/src/backends/ie/giebackend/giewrapper.hpp
+++ b/modules/gapi/src/backends/ie/giebackend/giewrapper.hpp
@@ -13,6 +13,7 @@
 
 #include <vector>
 #include <string>
+#include <fstream>
 
 #include "opencv2/gapi/infer/ie.hpp"
 
@@ -28,9 +29,6 @@ namespace wrap {
 GAPI_EXPORTS std::vector<std::string> getExtensions(const GIEParam& params);
 GAPI_EXPORTS IE::CNNNetwork readNetwork(const GIEParam& params);
 
-IE::InputsDataMap  toInputsDataMap (const IE::ConstInputsDataMap& inputs);
-IE::OutputsDataMap toOutputsDataMap(const IE::ConstOutputsDataMap& outputs);
-
 #if INF_ENGINE_RELEASE < 2019020000  // < 2019.R2
 using Plugin = IE::InferencePlugin;
 GAPI_EXPORTS IE::InferencePlugin getPlugin(const GIEParam& params);
@@ -50,12 +48,29 @@ GAPI_EXPORTS IE::Core getCore();
 GAPI_EXPORTS IE::Core getPlugin(const GIEParam& params);
 GAPI_EXPORTS inline IE::ExecutableNetwork loadNetwork(      IE::Core&       core,
                                                       const IE::CNNNetwork& net,
-                                                      const GIEParam& params) {
-    return core.LoadNetwork(net, params.device_id);
+                                                      const GIEParam& params,
+                                                      IE::RemoteContext::Ptr rctx = nullptr) {
+    if (rctx != nullptr) {
+        return core.LoadNetwork(net, rctx);
+    } else {
+        return core.LoadNetwork(net, params.device_id);
+    }
 }
 GAPI_EXPORTS inline IE::ExecutableNetwork importNetwork(      IE::Core& core,
-                                                        const GIEParam& param) {
-    return core.ImportNetwork(param.model_path, param.device_id, {});
+                                                        const GIEParam& params,
+                                                        IE::RemoteContext::Ptr rctx = nullptr) {
+    if (rctx != nullptr) {
+        std::filebuf blobFile;
+        if (!blobFile.open(params.model_path, std::ios::in | std::ios::binary))
+        {
+            blobFile.close();
+            throw std::runtime_error("Could not open file");
+        }
+        std::istream graphBlob(&blobFile);
+        return core.ImportNetwork(graphBlob, rctx);
+    } else {
+        return core.ImportNetwork(params.model_path, params.device_id, {});
+    }
 }
 #endif // INF_ENGINE_RELEASE < 2019020000
 }}}}
diff --git a/modules/gapi/src/compiler/gstreaming.cpp b/modules/gapi/src/compiler/gstreaming.cpp
index 3bdc0323b5c7..e45e77042755 100644
--- a/modules/gapi/src/compiler/gstreaming.cpp
+++ b/modules/gapi/src/compiler/gstreaming.cpp
@@ -75,6 +75,11 @@ bool cv::GStreamingCompiled::Priv::pull(cv::GOptRunArgsP &&outs)
     return m_exec->pull(std::move(outs));
 }
 
+std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> cv::GStreamingCompiled::Priv::pull()
+{
+    return m_exec->pull();
+}
+
 bool cv::GStreamingCompiled::Priv::try_pull(cv::GRunArgsP &&outs)
 {
     return m_exec->try_pull(std::move(outs));
@@ -123,18 +128,9 @@ bool cv::GStreamingCompiled::pull(cv::GRunArgsP &&outs)
     return m_priv->pull(std::move(outs));
 }
 
-std::tuple<bool, cv::GRunArgs> cv::GStreamingCompiled::pull()
+std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> cv::GStreamingCompiled::pull()
 {
-    GRunArgs run_args;
-    GRunArgsP outs;
-    const auto& out_info = m_priv->outInfo();
-    run_args.reserve(out_info.size());
-    outs.reserve(out_info.size());
-
-    cv::detail::constructGraphOutputs(m_priv->outInfo(), run_args, outs);
-
-    bool is_over = m_priv->pull(std::move(outs));
-    return std::make_tuple(is_over, run_args);
+    return m_priv->pull();
 }
 
 bool cv::GStreamingCompiled::pull(cv::GOptRunArgsP &&outs)
diff --git a/modules/gapi/src/compiler/gstreaming_priv.hpp b/modules/gapi/src/compiler/gstreaming_priv.hpp
index 59b19d425261..1b559ba31030 100644
--- a/modules/gapi/src/compiler/gstreaming_priv.hpp
+++ b/modules/gapi/src/compiler/gstreaming_priv.hpp
@@ -46,6 +46,7 @@ class GAPI_EXPORTS GStreamingCompiled::Priv
     void start();
     bool pull(cv::GRunArgsP &&outs);
     bool pull(cv::GOptRunArgsP &&outs);
+    std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> pull();
     bool try_pull(cv::GRunArgsP &&outs);
     void stop();
 
diff --git a/modules/gapi/src/executor/gexecutor.cpp b/modules/gapi/src/executor/gexecutor.cpp
index 6f313197ba36..9b51e70d5dae 100644
--- a/modules/gapi/src/executor/gexecutor.cpp
+++ b/modules/gapi/src/executor/gexecutor.cpp
@@ -159,8 +159,8 @@ void writeBackExec(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg)
         // a real copy (add a pass to StreamingBackend?)
         auto& out_mat = *util::get<cv::Mat*>(g_arg);
         const auto& rmat = mag.template slot<cv::RMat>().at(rc.id);
-        auto mag_data = rmat.get<RMatAdapter>()->data();
-        if (out_mat.data != mag_data) {
+        auto* adapter = rmat.get<RMatAdapter>();
+        if (adapter != nullptr && out_mat.data != adapter->data()) {
             auto view = rmat.access(RMat::Access::R);
             asMat(view).copyTo(out_mat);
         }
diff --git a/modules/gapi/src/executor/gstreamingexecutor.cpp b/modules/gapi/src/executor/gstreamingexecutor.cpp
index 74c96bdf3ef3..27049aef6327 100644
--- a/modules/gapi/src/executor/gstreamingexecutor.cpp
+++ b/modules/gapi/src/executor/gstreamingexecutor.cpp
@@ -1017,6 +1017,49 @@ void check_DesyncObjectConsumedByMultipleIslands(const cv::gimpl::GIslandModel::
     } // for(nodes)
 }
 
+// NB: Construct GRunArgsP based on passed info and store the memory in passed cv::GRunArgs.
+// Needed for python bridge, because in case python user doesn't pass output arguments to apply.
+void constructOptGraphOutputs(const cv::GTypesInfo &out_info,
+                                    cv::GOptRunArgs &args,
+                                    cv::GOptRunArgsP &outs)
+{
+    for (auto&& info : out_info)
+    {
+        switch (info.shape)
+        {
+            case cv::GShape::GMAT:
+            {
+                args.emplace_back(cv::optional<cv::Mat>{});
+                outs.emplace_back(&cv::util::get<cv::optional<cv::Mat>>(args.back()));
+                break;
+            }
+            case cv::GShape::GSCALAR:
+            {
+                args.emplace_back(cv::optional<cv::Scalar>{});
+                outs.emplace_back(&cv::util::get<cv::optional<cv::Scalar>>(args.back()));
+                break;
+            }
+            case cv::GShape::GARRAY:
+            {
+                cv::detail::VectorRef ref;
+                cv::util::get<cv::detail::ConstructVec>(info.ctor)(ref);
+                args.emplace_back(cv::util::make_optional(std::move(ref)));
+                outs.emplace_back(wrap_opt_arg(cv::util::get<cv::optional<cv::detail::VectorRef>>(args.back())));
+                break;
+            }
+            case cv::GShape::GOPAQUE:
+            {
+                cv::detail::OpaqueRef ref;
+                cv::util::get<cv::detail::ConstructOpaque>(info.ctor)(ref);
+                args.emplace_back(cv::util::make_optional(std::move(ref)));
+                outs.emplace_back(wrap_opt_arg(cv::util::get<cv::optional<cv::detail::OpaqueRef>>(args.back())));
+                break;
+            }
+            default:
+                cv::util::throw_error(std::logic_error("Unsupported optional output shape for Python"));
+        }
+    }
+}
 } // anonymous namespace
 
 class cv::gimpl::GStreamingExecutor::Synchronizer final {
@@ -1320,6 +1363,16 @@ cv::gimpl::GStreamingExecutor::GStreamingExecutor(std::unique_ptr<ade::Graph> &&
     // per the same input frame, so the output traffic multiplies)
     GAPI_Assert(m_collector_map.size() > 0u);
     m_out_queue.set_capacity(queue_capacity * m_collector_map.size());
+
+    // FIXME: The code duplicates logic of collectGraphInfo()
+    cv::gimpl::GModel::ConstGraph cgr(*m_orig_graph);
+    auto meta = cgr.metadata().get<cv::gimpl::Protocol>().out_nhs;
+    out_info.reserve(meta.size());
+
+    ade::util::transform(meta, std::back_inserter(out_info), [&cgr](const ade::NodeHandle& nh) {
+        const auto& data = cgr.metadata(nh).get<cv::gimpl::Data>();
+        return cv::GTypeInfo{data.shape, data.kind, data.ctor};
+    });
 }
 
 cv::gimpl::GStreamingExecutor::~GStreamingExecutor()
@@ -1653,6 +1706,31 @@ bool cv::gimpl::GStreamingExecutor::pull(cv::GOptRunArgsP &&outs)
     return true;
 }
 
+std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> cv::gimpl::GStreamingExecutor::pull()
+{
+    using RunArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    bool is_over = false;
+
+    if (m_desync) {
+        GOptRunArgs opt_run_args;
+        GOptRunArgsP opt_outs;
+        opt_outs.reserve(out_info.size());
+        opt_run_args.reserve(out_info.size());
+
+        constructOptGraphOutputs(out_info, opt_run_args, opt_outs);
+        is_over = pull(std::move(opt_outs));
+        return std::make_tuple(is_over, RunArgs(opt_run_args));
+    }
+
+    GRunArgs run_args;
+    GRunArgsP outs;
+    run_args.reserve(out_info.size());
+    outs.reserve(out_info.size());
+
+    constructGraphOutputs(out_info, run_args, outs);
+    is_over = pull(std::move(outs));
+    return std::make_tuple(is_over, RunArgs(run_args));
+}
 
 bool cv::gimpl::GStreamingExecutor::try_pull(cv::GRunArgsP &&outs)
 {
diff --git a/modules/gapi/src/executor/gstreamingexecutor.hpp b/modules/gapi/src/executor/gstreamingexecutor.hpp
index 40b787268228..b4aadcbbaf4d 100644
--- a/modules/gapi/src/executor/gstreamingexecutor.hpp
+++ b/modules/gapi/src/executor/gstreamingexecutor.hpp
@@ -195,6 +195,8 @@ class GStreamingExecutor final
 
     void wait_shutdown();
 
+    cv::GTypesInfo out_info;
+
 public:
     explicit GStreamingExecutor(std::unique_ptr<ade::Graph> &&g_model,
                                 const cv::GCompileArgs &comp_args);
@@ -203,6 +205,7 @@ class GStreamingExecutor final
     void start();
     bool pull(cv::GRunArgsP &&outs);
     bool pull(cv::GOptRunArgsP &&outs);
+    std::tuple<bool, cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>> pull();
     bool try_pull(cv::GRunArgsP &&outs);
     void stop();
     bool running() const;
diff --git a/modules/gapi/src/streaming/onevpl/onevpl_source.cpp b/modules/gapi/src/streaming/onevpl/onevpl_source.cpp
new file mode 100644
index 000000000000..988986f6d9d9
--- /dev/null
+++ b/modules/gapi/src/streaming/onevpl/onevpl_source.cpp
@@ -0,0 +1,48 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#include <opencv2/gapi/streaming/onevpl/onevpl_source.hpp>
+
+#include "streaming/onevpl/onevpl_source_priv.hpp"
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+#ifdef HAVE_ONEVPL
+OneVPLSource::OneVPLSource(const std::string& filePath) :
+    OneVPLSource(std::unique_ptr<Priv>(new OneVPLSource::Priv(filePath))) {
+
+    if (filePath.empty()) {
+        util::throw_error(std::logic_error("Cannot create 'OneVPLSource' on empty source file name"));
+    }
+}
+#else
+OneVPLSource::OneVPLSource(const std::string&) {
+    GAPI_Assert(false && "Unsupported: G-API compiled without `WITH_GAPI_ONEVPL=ON`");
+}
+#endif
+
+OneVPLSource::OneVPLSource(std::unique_ptr<Priv>&& impl) :
+    IStreamSource(),
+    m_priv(std::move(impl)) {
+}
+
+OneVPLSource::~OneVPLSource() {
+}
+
+bool OneVPLSource::pull(cv::gapi::wip::Data& data)
+{
+    return m_priv->pull(data);
+}
+
+GMetaArg OneVPLSource::descr_of() const
+{
+    return m_priv->descr_of();
+}
+} // namespace wip
+} // namespace gapi
+} // namespace cv
diff --git a/modules/gapi/src/streaming/onevpl/onevpl_source_priv.cpp b/modules/gapi/src/streaming/onevpl/onevpl_source_priv.cpp
new file mode 100644
index 000000000000..5c4e8e694175
--- /dev/null
+++ b/modules/gapi/src/streaming/onevpl/onevpl_source_priv.cpp
@@ -0,0 +1,63 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#include <algorithm>
+#include <sstream>
+
+#include "streaming/onevpl/onevpl_source_priv.hpp"
+#include "logger.hpp"
+
+#ifndef HAVE_ONEVPL
+namespace cv {
+namespace gapi {
+namespace wip {
+bool OneVPLSource::Priv::pull(cv::gapi::wip::Data&) {
+    return true;
+}
+GMetaArg OneVPLSource::Priv::descr_of() const {
+    return {};
+}
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#else // HAVE_ONEVPL
+
+namespace cv {
+namespace gapi {
+namespace wip {
+OneVPLSource::Priv::Priv() :
+    mfx_handle(MFXLoad())
+{
+    GAPI_LOG_INFO(nullptr, "Initialized MFX handle: " << mfx_handle);
+    description_is_valid = false;
+}
+
+OneVPLSource::Priv::Priv(const std::string&) :
+    OneVPLSource::Priv()
+{
+}
+
+OneVPLSource::Priv::~Priv()
+{
+    GAPI_LOG_INFO(nullptr, "Unload MFX handle: " << mfx_handle);
+    MFXUnload(mfx_handle);
+}
+
+bool OneVPLSource::Priv::pull(cv::gapi::wip::Data&)
+{
+    return false;
+}
+
+GMetaArg OneVPLSource::Priv::descr_of() const
+{
+    return {};
+}
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#endif // HAVE_ONEVPL
diff --git a/modules/gapi/src/streaming/onevpl/onevpl_source_priv.hpp b/modules/gapi/src/streaming/onevpl/onevpl_source_priv.hpp
new file mode 100644
index 000000000000..b139add99372
--- /dev/null
+++ b/modules/gapi/src/streaming/onevpl/onevpl_source_priv.hpp
@@ -0,0 +1,62 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2021 Intel Corporation
+
+#ifndef OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_PRIV_HPP
+#define OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_PRIV_HPP
+
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/streaming/meta.hpp>
+#include <opencv2/gapi/streaming/onevpl/onevpl_source.hpp>
+
+#ifdef HAVE_ONEVPL
+#if (MFX_VERSION >= 2000)
+#include <vpl/mfxdispatcher.h>
+#endif // MFX_VERSION
+
+#include <vpl/mfx.h>
+
+#include <vpl/mfxvideo.h>
+
+namespace cv {
+namespace gapi {
+namespace wip {
+
+struct OneVPLSource::Priv
+{
+    explicit Priv(const std::string& file_path);
+    ~Priv();
+
+    bool pull(cv::gapi::wip::Data& data);
+    GMetaArg descr_of() const;
+private:
+    Priv();
+    mfxLoader mfx_handle;
+    bool description_is_valid;
+};
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+
+#else // HAVE_ONEVPL
+
+namespace cv {
+namespace gapi {
+namespace wip {
+struct OneVPLSource::Priv final
+{
+    bool pull(cv::gapi::wip::Data&);
+    GMetaArg descr_of() const;
+};
+} // namespace wip
+} // namespace gapi
+} // namespace cv
+#endif // HAVE_ONEVPL
+#endif // OPENCV_GAPI_STREAMING_ONEVPL_ONEVPL_SOURCE_PRIV_HPP
diff --git a/modules/gapi/test/infer/gapi_infer_onnx_test.cpp b/modules/gapi/test/infer/gapi_infer_onnx_test.cpp
index ef192b9d6a96..b1bf9c935694 100644
--- a/modules/gapi/test/infer/gapi_infer_onnx_test.cpp
+++ b/modules/gapi/test/infer/gapi_infer_onnx_test.cpp
@@ -67,17 +67,17 @@ struct ONNXInitPath {
 static ONNXInitPath g_init_path;
 
 cv::Mat initMatrixRandU(const int type, const cv::Size& sz_in) {
-    const cv::Mat in_mat1 = cv::Mat(sz_in, type);
+    const cv::Mat in_mat = cv::Mat(sz_in, type);
 
     if (CV_MAT_DEPTH(type) < CV_32F) {
-        cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+        cv::randu(in_mat, cv::Scalar::all(0), cv::Scalar::all(255));
     } else {
         const int fscale = 256;  // avoid bits near ULP, generate stable test input
-        cv::Mat in_mat32s(in_mat1.size(), CV_MAKE_TYPE(CV_32S, CV_MAT_CN(type)));
+        cv::Mat in_mat32s(in_mat.size(), CV_MAKE_TYPE(CV_32S, CV_MAT_CN(type)));
         cv::randu(in_mat32s, cv::Scalar::all(0), cv::Scalar::all(255 * fscale));
-        in_mat32s.convertTo(in_mat1, type, 1.0f / fscale, 0);
+        in_mat32s.convertTo(in_mat, type, 1.0f / fscale, 0);
     }
-    return in_mat1;
+    return in_mat;
 }
 } // anonymous namespace
 namespace opencv_test
@@ -319,15 +319,13 @@ class ONNXtest : public ::testing::Test {
     size_t num_in, num_out;
     std::vector<cv::Mat> out_gapi;
     std::vector<cv::Mat> out_onnx;
-    cv::Mat in_mat1;
+    cv::Mat in_mat;
 
     ONNXtest() {
         initTestDataPath();
         env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "test");
         memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
         out_gapi.resize(1);
-        // FIXME: It should be an image from own (gapi) directory in opencv extra
-        in_mat1 = cv::imread(findDataFile("cv/dpm/cat.png"));
     }
 
     template<typename T>
@@ -463,13 +461,9 @@ class ONNXMediaFrame : public ONNXClassification {
         cv::Rect(cv::Point{70, 10}, cv::Size{20, 260}),
         cv::Rect(cv::Point{5, 15}, cv::Size{200, 160}),
     };
-    cv::Mat m_in_y;
-    cv::Mat m_in_uv;
-    virtual void SetUp() {
-        cv::Size sz{640, 480};
-        m_in_y = initMatrixRandU(CV_8UC1, sz);
-        m_in_uv = initMatrixRandU(CV_8UC2, sz / 2);
-    }
+    const cv::Size sz{640, 480};
+    const cv::Mat m_in_y = initMatrixRandU(CV_8UC1, sz);
+    const cv::Mat m_in_uv = initMatrixRandU(CV_8UC2, sz / 2);
 };
 
 class ONNXGRayScale : public ONNXtest {
@@ -545,20 +539,20 @@ class ONNXYoloV3 : public ONNXWithRemap {
 public:
     std::vector<cv::Mat> ins;
 
-private:
-    virtual void SetUp() {
+    void constructYoloInputs(const cv::Mat& src) {
         const int yolo_in_h = 416;
         const int yolo_in_w = 416;
         cv::Mat yolov3_input, shape, prep_mat;
-        cv::resize(in_mat1, yolov3_input, cv::Size(yolo_in_w, yolo_in_h));
+        cv::resize(src, yolov3_input, cv::Size(yolo_in_w, yolo_in_h));
         shape.create(cv::Size(2, 1), CV_32F);
         float* ptr = shape.ptr<float>();
-        ptr[0] = in_mat1.cols;
-        ptr[1] = in_mat1.rows;
+        ptr[0] = src.cols;
+        ptr[1] = src.rows;
         preprocess(yolov3_input, prep_mat);
         ins = {prep_mat, shape};
     }
 
+private:
     void preprocess(const cv::Mat& src, cv::Mat& dst) {
         cv::Mat cvt;
         src.convertTo(cvt, CV_32F, 1.f / 255.f);
@@ -571,9 +565,10 @@ class ONNXYoloV3 : public ONNXWithRemap {
 TEST_F(ONNXClassification, Infer)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
     cv::Mat processed_mat;
-    preprocess(in_mat1, processed_mat);
+    preprocess(in_mat, processed_mat);
     infer<float>(processed_mat, out_onnx);
     // G_API code
     G_API_NET(SqueezNet, <cv::GMat(cv::GMat)>, "squeeznet");
@@ -583,7 +578,7 @@ TEST_F(ONNXClassification, Infer)
     // NOTE: We have to normalize U8 tensor
     // so cfgMeanStd() is here
     auto net = cv::gapi::onnx::Params<SqueezNet> { model_path }.cfgMeanStd({ mean }, { std });
-    comp.apply(cv::gin(in_mat1),
+    comp.apply(cv::gin(in_mat),
                cv::gout(out_gapi.front()),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -593,9 +588,10 @@ TEST_F(ONNXClassification, Infer)
 TEST_F(ONNXClassification, InferTensor)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // Create tensor
     cv::Mat tensor;
-    preprocess(in_mat1, tensor);
+    preprocess(in_mat, tensor);
     // ONNX_API code
     infer<float>(tensor, out_onnx);
     // G_API code
@@ -614,10 +610,11 @@ TEST_F(ONNXClassification, InferTensor)
 TEST_F(ONNXClassification, InferROI)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     const auto ROI = rois.at(0);
     // ONNX_API code
     cv::Mat roi_mat;
-    preprocess(in_mat1(ROI), roi_mat);
+    preprocess(in_mat(ROI), roi_mat);
     infer<float>(roi_mat, out_onnx);
     // G_API code
     G_API_NET(SqueezNet, <cv::GMat(cv::GMat)>, "squeeznet");
@@ -628,7 +625,7 @@ TEST_F(ONNXClassification, InferROI)
     // NOTE: We have to normalize U8 tensor
     // so cfgMeanStd() is here
     auto net = cv::gapi::onnx::Params<SqueezNet> { model_path }.cfgMeanStd({ mean }, { std });
-    comp.apply(cv::gin(in_mat1, ROI),
+    comp.apply(cv::gin(in_mat, ROI),
                cv::gout(out_gapi.front()),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -638,10 +635,11 @@ TEST_F(ONNXClassification, InferROI)
 TEST_F(ONNXClassification, InferROIList)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
     for (size_t i = 0; i < rois.size(); ++i) {
         cv::Mat roi_mat;
-        preprocess(in_mat1(rois[i]), roi_mat);
+        preprocess(in_mat(rois[i]), roi_mat);
         infer<float>(roi_mat, out_onnx);
     }
     // G_API code
@@ -653,7 +651,7 @@ TEST_F(ONNXClassification, InferROIList)
     // NOTE: We have to normalize U8 tensor
     // so cfgMeanStd() is here
     auto net = cv::gapi::onnx::Params<SqueezNet> { model_path }.cfgMeanStd({ mean }, { std });
-    comp.apply(cv::gin(in_mat1, rois),
+    comp.apply(cv::gin(in_mat, rois),
                cv::gout(out_gapi),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -663,10 +661,11 @@ TEST_F(ONNXClassification, InferROIList)
 TEST_F(ONNXClassification, Infer2ROIList)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
     for (size_t i = 0; i < rois.size(); ++i) {
         cv::Mat roi_mat;
-        preprocess(in_mat1(rois[i]), roi_mat);
+        preprocess(in_mat(rois[i]), roi_mat);
         infer<float>(roi_mat, out_onnx);
     }
     // G_API code
@@ -678,7 +677,7 @@ TEST_F(ONNXClassification, Infer2ROIList)
     // NOTE: We have to normalize U8 tensor
     // so cfgMeanStd() is here
     auto net = cv::gapi::onnx::Params<SqueezNet> { model_path }.cfgMeanStd({ mean }, { std });
-    comp.apply(cv::gin(in_mat1, rois),
+    comp.apply(cv::gin(in_mat, rois),
                cv::gout(out_gapi),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -688,9 +687,10 @@ TEST_F(ONNXClassification, Infer2ROIList)
 TEST_F(ONNXWithRemap, InferDynamicInputTensor)
 {
     useModel("object_detection_segmentation/tiny-yolov2/model/tinyyolov2-8");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // Create tensor
     cv::Mat cvt, rsz, tensor;
-    cv::resize(in_mat1, rsz, cv::Size{416, 416});
+    cv::resize(in_mat, rsz, cv::Size{416, 416});
     rsz.convertTo(cvt, CV_32F, 1.f / 255.f);
     toCHW(cvt, tensor);
     tensor = tensor.reshape(1, {1, 3, 416, 416});
@@ -714,9 +714,10 @@ TEST_F(ONNXWithRemap, InferDynamicInputTensor)
 TEST_F(ONNXGRayScale, InferImage)
 {
     useModel("body_analysis/emotion_ferplus/model/emotion-ferplus-8");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
     cv::Mat prep_mat;
-    preprocess(in_mat1, prep_mat);
+    preprocess(in_mat, prep_mat);
     infer<float>(prep_mat, out_onnx);
     // G_API code
     G_API_NET(EmotionNet, <cv::GMat(cv::GMat)>, "emotion-ferplus");
@@ -725,7 +726,7 @@ TEST_F(ONNXGRayScale, InferImage)
     cv::GComputation comp(cv::GIn(in), cv::GOut(out));
     auto net = cv::gapi::onnx::Params<EmotionNet> { model_path }
         .cfgNormalize({ false }); // model accepts 0..255 range in FP32;
-    comp.apply(cv::gin(in_mat1),
+    comp.apply(cv::gin(in_mat),
                cv::gout(out_gapi.front()),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -735,8 +736,9 @@ TEST_F(ONNXGRayScale, InferImage)
 TEST_F(ONNXWithRemap, InferMultiOutput)
 {
     useModel("object_detection_segmentation/ssd-mobilenetv1/model/ssd_mobilenet_v1_10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
-    const auto prep_mat = in_mat1.reshape(1, {1, in_mat1.rows, in_mat1.cols, in_mat1.channels()});
+    const auto prep_mat = in_mat.reshape(1, {1, in_mat.rows, in_mat.cols, in_mat.channels()});
     infer<uint8_t>(prep_mat, out_onnx);
     cv::Mat onnx_conv_out({1, 1, 200, 7}, CV_32F);
     remapToIESSDOut({out_onnx[3], out_onnx[0], out_onnx[2], out_onnx[1]}, onnx_conv_out);
@@ -750,7 +752,7 @@ TEST_F(ONNXWithRemap, InferMultiOutput)
     auto net = cv::gapi::onnx::Params<MobileNet>{ model_path }
         .cfgOutputLayers({"detection_output"})
         .cfgPostProc({cv::GMatDesc{CV_32F, {1, 1, 200, 7}}}, remapSSDPorts);
-    comp.apply(cv::gin(in_mat1),
+    comp.apply(cv::gin(in_mat),
                cv::gout(out_gapi.front()),
                cv::compile_args(cv::gapi::networks(net)));
     // Validate
@@ -760,12 +762,13 @@ TEST_F(ONNXWithRemap, InferMultiOutput)
 TEST_F(ONNXMediaFrame, InferBGR)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // ONNX_API code
     cv::Mat processed_mat;
-    preprocess(in_mat1, processed_mat);
+    preprocess(in_mat, processed_mat);
     infer<float>(processed_mat, out_onnx);
     // G_API code
-    auto frame = MediaFrame::Create<TestMediaBGR>(in_mat1);
+    auto frame = MediaFrame::Create<TestMediaBGR>(in_mat);
     G_API_NET(SqueezNet, <cv::GMat(cv::GMat)>, "squeeznet");
     cv::GFrame in;
     cv::GMat out = cv::gapi::infer<SqueezNet>(in);
@@ -783,6 +786,7 @@ TEST_F(ONNXMediaFrame, InferBGR)
 TEST_F(ONNXMediaFrame, InferYUV)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     const auto frame = MediaFrame::Create<TestMediaNV12>(m_in_y, m_in_uv);
     // ONNX_API code
     cv::Mat pp;
@@ -808,10 +812,11 @@ TEST_F(ONNXMediaFrame, InferYUV)
 TEST_F(ONNXMediaFrame, InferROIBGR)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
-    auto frame = MediaFrame::Create<TestMediaBGR>(in_mat1);
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
+    auto frame = MediaFrame::Create<TestMediaBGR>(in_mat);
     // ONNX_API code
     cv::Mat roi_mat;
-    preprocess(in_mat1(rois.front()), roi_mat);
+    preprocess(in_mat(rois.front()), roi_mat);
     infer<float>(roi_mat, out_onnx);
     // G_API code
     G_API_NET(SqueezNet, <cv::GMat(cv::GMat)>, "squeeznet");
@@ -832,6 +837,7 @@ TEST_F(ONNXMediaFrame, InferROIBGR)
 TEST_F(ONNXMediaFrame, InferROIYUV)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     const auto frame = MediaFrame::Create<TestMediaNV12>(m_in_y, m_in_uv);
     // ONNX_API code
     cv::Mat pp;
@@ -858,11 +864,12 @@ TEST_F(ONNXMediaFrame, InferROIYUV)
 TEST_F(ONNXMediaFrame, InferListBGR)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
-    const auto frame = MediaFrame::Create<TestMediaBGR>(in_mat1);
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
+    const auto frame = MediaFrame::Create<TestMediaBGR>(in_mat);
     // ONNX_API code
     for (size_t i = 0; i < rois.size(); ++i) {
         cv::Mat roi_mat;
-        preprocess(in_mat1(rois[i]), roi_mat);
+        preprocess(in_mat(rois[i]), roi_mat);
         infer<float>(roi_mat, out_onnx);
     }
     // G_API code
@@ -884,6 +891,7 @@ TEST_F(ONNXMediaFrame, InferListBGR)
 TEST_F(ONNXMediaFrame, InferListYUV)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     const auto frame = MediaFrame::Create<TestMediaNV12>(m_in_y, m_in_uv);
     // ONNX_API code
     cv::Mat pp;
@@ -911,8 +919,9 @@ TEST_F(ONNXMediaFrame, InferListYUV)
 TEST_F(ONNXRCNN, InferWithDisabledOut)
 {
     useModel("object_detection_segmentation/faster-rcnn/model/FasterRCNN-10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     cv::Mat pp;
-    preprocess(in_mat1, pp);
+    preprocess(in_mat, pp);
     // ONNX_API code
     infer<float>(pp, out_onnx, {"6379", "6383"});
     // G_API code
@@ -937,11 +946,12 @@ TEST_F(ONNXRCNN, InferWithDisabledOut)
 TEST_F(ONNXMediaFrame, InferList2BGR)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
-    const auto frame = MediaFrame::Create<TestMediaBGR>(in_mat1);
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
+    const auto frame = MediaFrame::Create<TestMediaBGR>(in_mat);
     // ONNX_API code
     for (size_t i = 0; i < rois.size(); ++i) {
         cv::Mat roi_mat;
-        preprocess(in_mat1(rois[i]), roi_mat);
+        preprocess(in_mat(rois[i]), roi_mat);
         infer<float>(roi_mat, out_onnx);
     }
     // G_API code
@@ -963,6 +973,7 @@ TEST_F(ONNXMediaFrame, InferList2BGR)
 TEST_F(ONNXMediaFrame, InferList2YUV)
 {
     useModel("classification/squeezenet/model/squeezenet1.0-9");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     const auto frame = MediaFrame::Create<TestMediaNV12>(m_in_y, m_in_uv);
     // ONNX_API code
     cv::Mat pp;
@@ -991,6 +1002,8 @@ TEST_F(ONNXMediaFrame, InferList2YUV)
 TEST_F(ONNXYoloV3, InferConstInput)
 {
     useModel("object_detection_segmentation/yolov3/model/yolov3-10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
+    constructYoloInputs(in_mat);
     // ONNX_API code
     infer<float>(ins, out_onnx);
     // G_API code
@@ -1022,6 +1035,8 @@ TEST_F(ONNXYoloV3, InferBSConstInput)
     // and all input layer names are specified.
     // Const input has the advantage. It is expected behavior.
     useModel("object_detection_segmentation/yolov3/model/yolov3-10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
+    constructYoloInputs(in_mat);
     // Tensor with incorrect image size
     // is used for check case when InputLayers and constInput have same names
     cv::Mat bad_shape;
@@ -1059,8 +1074,9 @@ TEST_F(ONNXYoloV3, InferBSConstInput)
 TEST_F(ONNXRCNN, ConversionInt64to32)
 {
     useModel("object_detection_segmentation/faster-rcnn/model/FasterRCNN-10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     cv::Mat dst;
-    preprocess(in_mat1, dst);
+    preprocess(in_mat, dst);
     // ONNX_API code
     infer<float>(dst, out_onnx);
     // G_API code
@@ -1087,6 +1103,7 @@ TEST_F(ONNXRCNN, ConversionInt64to32)
 TEST_F(ONNXWithRemap, InferOutReallocation)
 {
     useModel("object_detection_segmentation/ssd-mobilenetv1/model/ssd_mobilenet_v1_10");
+    in_mat = cv::imread(findDataFile("cv/dpm/cat.png", false));
     // G_API code
     G_API_NET(MobileNet, <cv::GMat(cv::GMat)>, "ssd_mobilenet");
     auto net = cv::gapi::onnx::Params<MobileNet>{model_path}
@@ -1096,7 +1113,7 @@ TEST_F(ONNXWithRemap, InferOutReallocation)
     cv::GMat out1;
     out1 = cv::gapi::infer<MobileNet>(in);
     cv::GComputation comp(cv::GIn(in), cv::GOut(out1));
-    EXPECT_THROW(comp.apply(cv::gin(in_mat1),
+    EXPECT_THROW(comp.apply(cv::gin(in_mat),
                  cv::gout(out_gapi[0]),
                  cv::compile_args(cv::gapi::networks(net))), std::exception);
 }
diff --git a/modules/gapi/test/render/gapi_render_tests_ocv.cpp b/modules/gapi/test/render/gapi_render_tests_ocv.cpp
index 010df5dff75b..95f695415609 100644
--- a/modules/gapi/test/render/gapi_render_tests_ocv.cpp
+++ b/modules/gapi/test/render/gapi_render_tests_ocv.cpp
@@ -639,8 +639,8 @@ INSTANTIATE_TEST_CASE_P(RenderBGROCVTestRectsImpl, RenderBGROCVTestRects,
                                 Values(cv::Rect(100, 100, 200, 200)),
                                 Values(cv::Scalar(100, 50, 150)),
                                 Values(2),
-                                Values(LINE_8),
-                                Values(0)));
+                                Values(LINE_8, LINE_4),
+                                Values(0, 1)));
 
 INSTANTIATE_TEST_CASE_P(RenderNV12OCVTestRectsImpl, RenderNV12OCVTestRects,
                         Combine(Values(cv::Size(1280, 720)),
@@ -673,8 +673,8 @@ INSTANTIATE_TEST_CASE_P(RenderNV12OCVTestCirclesImpl, RenderNV12OCVTestCircles,
                                 Values(10),
                                 Values(cv::Scalar(100, 50, 150)),
                                 Values(2),
-                                Values(LINE_8),
-                                Values(0)));
+                                Values(LINE_8, LINE_4),
+                                Values(0, 1)));
 
 INSTANTIATE_TEST_CASE_P(RenderMFrameOCVTestCirclesImpl, RenderMFrameOCVTestCircles,
                         Combine(Values(cv::Size(1280, 720)),
diff --git a/modules/gapi/test/s11n/gapi_s11n_tests.cpp b/modules/gapi/test/s11n/gapi_s11n_tests.cpp
index f4a30b394631..4c6e63b55204 100644
--- a/modules/gapi/test/s11n/gapi_s11n_tests.cpp
+++ b/modules/gapi/test/s11n/gapi_s11n_tests.cpp
@@ -2,6 +2,7 @@
 
 #include "backends/common/serialization.hpp"
 #include <opencv2/gapi/rmat.hpp>
+#include <opencv2/gapi/media.hpp>
 #include <../src/backends/common/gbackend.hpp> // asView
 
 namespace {
@@ -148,6 +149,29 @@ class MyRMatAdapter : public cv::RMat::Adapter {
     int getVal() { return m_value; }
     std::string getStr() { return m_str; }
 };
+
+class MyMediaFrameAdapter : public cv::MediaFrame::IAdapter {
+    cv::Mat m_mat;
+    int m_value;
+    std::string m_str;
+public:
+    MyMediaFrameAdapter() = default;
+    MyMediaFrameAdapter(cv::Mat m, int value, const std::string& str)
+        : m_mat(m), m_value(value), m_str(str)
+    {}
+    virtual cv::MediaFrame::View access(cv::MediaFrame::Access) override {
+        return cv::MediaFrame::View({m_mat.data}, {m_mat.step});
+    }
+    virtual cv::GFrameDesc meta() const override { return {cv::MediaFormat::BGR, m_mat.size()}; }
+    virtual void serialize(cv::gapi::s11n::IOStream& os) override {
+        os << m_value << m_str;
+    }
+    virtual void deserialize(cv::gapi::s11n::IIStream& is) override {
+        is >> m_value >> m_str;
+    }
+    int getVal() { return m_value; }
+    std::string getStr() { return m_str; }
+};
 }
 
 namespace opencv_test {
@@ -581,6 +605,17 @@ TEST_F(S11N_Basic, Test_Vector_Of_Strings) {
     EXPECT_EQ("42", des[2]);
 }
 
+TEST_F(S11N_Basic, Test_RunArg) {
+    cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+    auto v = cv::GRunArgs{ cv::GRunArg{ mat } };
+
+    const std::vector<char> sargsin = cv::gapi::serialize(v);
+    cv::GRunArgs out = cv::gapi::deserialize<cv::GRunArgs>(sargsin);
+    cv::Mat out_mat = cv::util::get<cv::Mat>(out[0]);
+
+    EXPECT_EQ(0, cv::norm(mat, out_mat));
+}
+
 TEST_F(S11N_Basic, Test_RunArg_RMat) {
     cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
     cv::RMat rmat = cv::make_rmat<MyRMatAdapter>(mat, 42, "It actually works");
@@ -614,6 +649,87 @@ TEST_F(S11N_Basic, Test_RunArg_RMat_Scalar_Mat) {
     EXPECT_EQ(0, cv::norm(mat, out_mat));
 }
 
+TEST_F(S11N_Basic, Test_RunArg_MediaFrame) {
+    cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+    auto frame = cv::MediaFrame::Create<MyMediaFrameAdapter>(mat, 42, "It actually works");
+    auto v = cv::GRunArgs{ cv::GRunArg{ frame } };
+
+    const std::vector<char> sargsin = cv::gapi::serialize(v);
+    cv::GRunArgs out = cv::gapi::deserialize<cv::GRunArgs, MyMediaFrameAdapter>(sargsin);
+    cv::MediaFrame out_mat = cv::util::get<cv::MediaFrame>(out[0]);
+    auto adapter = out_mat.get<MyMediaFrameAdapter>();
+    EXPECT_EQ(42, adapter->getVal());
+    EXPECT_EQ("It actually works", adapter->getStr());
+}
+
+TEST_F(S11N_Basic, Test_RunArg_MediaFrame_Scalar_Mat) {
+    cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+    auto frame = cv::MediaFrame::Create<MyMediaFrameAdapter>(mat, 42, "It actually works");
+    cv::Scalar sc(111);
+    auto v = cv::GRunArgs{ cv::GRunArg{ frame }, cv::GRunArg{ sc }, cv::GRunArg{ mat } };
+
+    const std::vector<char> sargsin = cv::gapi::serialize(v);
+    cv::GRunArgs out = cv::gapi::deserialize<cv::GRunArgs, MyMediaFrameAdapter>(sargsin);
+    cv::MediaFrame out_frame = cv::util::get<cv::MediaFrame>(out[0]);
+    auto adapter = out_frame.get<MyMediaFrameAdapter>();
+    EXPECT_EQ(42, adapter->getVal());
+    EXPECT_EQ("It actually works", adapter->getStr());
+
+    cv::Scalar out_sc = cv::util::get<cv::Scalar>(out[1]);
+    EXPECT_EQ(sc, out_sc);
+
+    cv::Mat out_mat = cv::util::get<cv::Mat>(out[2]);
+    EXPECT_EQ(0, cv::norm(mat, out_mat));
+}
+
+TEST_F(S11N_Basic, Test_RunArg_MediaFrame_RMat) {
+    cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+    cv::Mat mat2 = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+
+    auto frame = cv::MediaFrame::Create<MyMediaFrameAdapter>(mat, 42, "It actually works");
+    auto rmat = cv::make_rmat<MyRMatAdapter>(mat2, 24, "Hello there");
+
+    auto v = cv::GRunArgs{ cv::GRunArg{ frame }, cv::GRunArg{ rmat } };
+
+    const std::vector<char> sargsin = cv::gapi::serialize(v);
+    cv::GRunArgs out = cv::gapi::deserialize<cv::GRunArgs, MyMediaFrameAdapter, MyRMatAdapter>(sargsin);
+
+    cv::MediaFrame out_frame = cv::util::get<cv::MediaFrame>(out[0]);
+    cv::RMat out_rmat = cv::util::get<cv::RMat>(out[1]);
+
+    auto adapter = out_frame.get<MyMediaFrameAdapter>();
+    EXPECT_EQ(42, adapter->getVal());
+    EXPECT_EQ("It actually works", adapter->getStr());
+
+    auto adapter2 = out_rmat.get<MyRMatAdapter>();
+    EXPECT_EQ(24, adapter2->getVal());
+    EXPECT_EQ("Hello there", adapter2->getStr());
+}
+
+TEST_F(S11N_Basic, Test_RunArg_RMat_MediaFrame) {
+    cv::Mat mat = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+    cv::Mat mat2 = cv::Mat::eye(cv::Size(128, 64), CV_8UC3);
+
+    auto frame = cv::MediaFrame::Create<MyMediaFrameAdapter>(mat, 42, "It actually works");
+    auto rmat = cv::make_rmat<MyRMatAdapter>(mat2, 24, "Hello there");
+
+    auto v = cv::GRunArgs{ cv::GRunArg{ rmat }, cv::GRunArg{ frame } };
+
+    const std::vector<char> sargsin = cv::gapi::serialize(v);
+    cv::GRunArgs out = cv::gapi::deserialize<cv::GRunArgs, MyMediaFrameAdapter, MyRMatAdapter>(sargsin);
+
+    cv::RMat out_rmat = cv::util::get<cv::RMat>(out[0]);
+    cv::MediaFrame out_frame = cv::util::get<cv::MediaFrame>(out[1]);
+
+    auto adapter = out_frame.get<MyMediaFrameAdapter>();
+    EXPECT_EQ(42, adapter->getVal());
+    EXPECT_EQ("It actually works", adapter->getStr());
+
+    auto adapter2 = out_rmat.get<MyRMatAdapter>();
+    EXPECT_EQ(24, adapter2->getVal());
+    EXPECT_EQ("Hello there", adapter2->getStr());
+}
+
 namespace {
     template <cv::detail::OpaqueKind K, typename T>
     bool verifyOpaqueKind(T&& in) {
@@ -754,8 +870,6 @@ TEST_F(S11N_Basic, Test_Deserialize_CompileArgs_RandomOrder) {
     std::vector<char> sArgs = cv::gapi::serialize(
         cv::compile_args(simpleCustomVar, simpleCustomVar2));
     GCompileArgs dArgs = cv::gapi::deserialize<GCompileArgs,
-                                               // Here, types of passed to serialize() arguments
-                                               // are enumerated in reverse order
                                                SimpleCustomType2,
                                                SimpleCustomType>(sArgs);
 
diff --git a/modules/gapi/test/s11n/gapi_sample_pipelines_s11n.cpp b/modules/gapi/test/s11n/gapi_sample_pipelines_s11n.cpp
index 885457cd9063..c3d21a3f6f8c 100644
--- a/modules/gapi/test/s11n/gapi_sample_pipelines_s11n.cpp
+++ b/modules/gapi/test/s11n/gapi_sample_pipelines_s11n.cpp
@@ -806,4 +806,33 @@ TEST(S11N, Pipeline_Render_RGB)
 
     EXPECT_EQ(cv::norm(input,  ref_mat), 0);
 }
+
+TEST(S11N, Pipeline_Const_GScalar)
+{
+    static constexpr auto in_scalar = 10;
+
+    cv::GMat a;
+    cv::GScalar s;
+
+    cv::GComputation computation(GIn(a), GOut(cv::gapi::addC(a, in_scalar)));
+    auto p = cv::gapi::serialize(computation);
+    auto deserialized_computation = cv::gapi::deserialize<cv::GComputation>(p);
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat ref_mat;
+    cv::add(in_mat, in_scalar, ref_mat);
+
+    cv::Mat out_mat;
+    computation.apply(cv::gin(in_mat/*, in_scalar*/), cv::gout(out_mat));
+    EXPECT_EQ(0, cvtest::norm(out_mat, ref_mat, NORM_INF));
+
+    out_mat = cv::Mat();
+    deserialized_computation.apply(cv::gin(in_mat/*, in_scalar*/), cv::gout(out_mat));
+    EXPECT_EQ(0, cvtest::norm(out_mat, ref_mat, NORM_INF));
+
+    out_mat = cv::Mat();
+    auto cc = deserialized_computation.compile(cv::descr_of(in_mat));
+    cc(cv::gin(in_mat/*, in_scalar*/), cv::gout(out_mat));
+    EXPECT_EQ(0, cvtest::norm(out_mat, ref_mat, NORM_INF));
+}
 } // namespace opencv_test
diff --git a/modules/gapi/test/streaming/gapi_streaming_tests.cpp b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
index f3179a70813a..5386d1736f67 100644
--- a/modules/gapi/test/streaming/gapi_streaming_tests.cpp
+++ b/modules/gapi/test/streaming/gapi_streaming_tests.cpp
@@ -244,6 +244,35 @@ class NV12Source : public cv::gapi::wip::GCaptureSource {
     }
 };
 
+void checkPullOverload(const cv::Mat& ref,
+                       const bool has_output,
+                       cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>& args) {
+    EXPECT_TRUE(has_output);
+    using runArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    cv::Mat out_mat;
+    switch (args.index()) {
+        case runArgs::index_of<cv::GRunArgs>():
+        {
+            auto outputs = util::get<cv::GRunArgs>(args);
+            EXPECT_EQ(1u, outputs.size());
+            out_mat = cv::util::get<cv::Mat>(outputs[0]);
+            break;
+        }
+        case runArgs::index_of<cv::GOptRunArgs>():
+        {
+            auto outputs = util::get<cv::GOptRunArgs>(args);
+            EXPECT_EQ(1u, outputs.size());
+            auto opt_mat = cv::util::get<cv::optional<cv::Mat>>(outputs[0]);
+            ASSERT_TRUE(opt_mat.has_value());
+            out_mat = *opt_mat;
+            break;
+        }
+        default: GAPI_Assert(false && "Incorrect type of Args");
+    }
+
+    EXPECT_EQ(0., cv::norm(ref, out_mat, cv::NORM_INF));
+}
+
 } // anonymous namespace
 
 TEST_P(GAPI_Streaming, SmokeTest_ConstInput_GMat)
@@ -1336,13 +1365,45 @@ TEST(Streaming, Python_Pull_Overload)
 
     bool has_output;
     cv::GRunArgs outputs;
-    std::tie(has_output, outputs) = ccomp.pull();
+    using RunArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    RunArgs args;
 
-    EXPECT_TRUE(has_output);
-    EXPECT_EQ(1u, outputs.size());
+    std::tie(has_output, args) = ccomp.pull();
+
+    checkPullOverload(in_mat, has_output, args);
+
+    ccomp.stop();
+    EXPECT_FALSE(ccomp.running());
+}
+
+TEST(GAPI_Streaming_Desync, Python_Pull_Overload)
+{
+    cv::GMat in;
+    cv::GMat out = cv::gapi::streaming::desync(in);
+    cv::GComputation c(in, out);
+
+    cv::Size sz(3,3);
+    cv::Mat in_mat(sz, CV_8UC3);
+    cv::randu(in_mat, cv::Scalar::all(0), cv::Scalar(255));
 
-    auto out_mat = cv::util::get<cv::Mat>(outputs[0]);
-    EXPECT_EQ(0., cv::norm(in_mat, out_mat, cv::NORM_INF));
+    auto ccomp = c.compileStreaming();
+
+    EXPECT_TRUE(ccomp);
+    EXPECT_FALSE(ccomp.running());
+
+    ccomp.setSource(cv::gin(in_mat));
+
+    ccomp.start();
+    EXPECT_TRUE(ccomp.running());
+
+    bool has_output;
+    cv::GRunArgs outputs;
+    using RunArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    RunArgs args;
+
+    std::tie(has_output, args) = ccomp.pull();
+
+    checkPullOverload(in_mat, has_output, args);
 
     ccomp.stop();
     EXPECT_FALSE(ccomp.running());
@@ -2132,9 +2193,17 @@ TEST(GAPI_Streaming, TestPythonAPI)
 
     bool is_over = false;
     cv::GRunArgs out_args;
+    using RunArgs = cv::util::variant<cv::GRunArgs, cv::GOptRunArgs>;
+    RunArgs args;
 
     // NB: Used by python bridge
-    std::tie(is_over, out_args) = cc.pull();
+    std::tie(is_over, args) = cc.pull();
+
+    switch (args.index()) {
+        case RunArgs::index_of<cv::GRunArgs>():
+            out_args = util::get<cv::GRunArgs>(args); break;
+        default: GAPI_Assert(false && "Incorrect type of return value");
+    }
 
     ASSERT_EQ(1u, out_args.size());
     ASSERT_TRUE(cv::util::holds_alternative<cv::Mat>(out_args[0]));
diff --git a/modules/gapi/test/util/variant_tests.cpp b/modules/gapi/test/util/variant_tests.cpp
index 65d5e579f81b..7725f9a70211 100644
--- a/modules/gapi/test/util/variant_tests.cpp
+++ b/modules/gapi/test/util/variant_tests.cpp
@@ -354,6 +354,20 @@ TEST(Variant, Get)
     EXPECT_THROW(util::get<int>(cv2), util::bad_variant_access);
 }
 
+TEST(Variant, GetIndexed)
+{
+    const TestVar cv(42);
+
+    // Test const& get()
+    EXPECT_EQ(42, util::get<0>(cv));
+    EXPECT_THROW(util::get<1>(cv), util::bad_variant_access);
+
+    // Test &get
+    TestVar cv2(std::string("42"));
+    EXPECT_EQ("42", util::get<1>(cv2));
+    EXPECT_THROW(util::get<0>(cv2), util::bad_variant_access);
+}
+
 TEST(Variant, GetWrite)
 {
     util::variant<int, std::string> v(42);
@@ -486,4 +500,240 @@ TEST(Variant, EXT_IndexOf)
     static_assert(6u == V::index_of<MyClass>(), "Index is incorrect");
 }
 
+namespace test_validation
+{
+struct MyType
+{
+    friend std::ostream& operator<<(std::ostream& out, const MyType& src)
+    {
+        return out << "MyType"; (void) src;
+    }
+};
+class MyClass
+{
+    friend std::ostream& operator<<(std::ostream& out, const MyClass& src)
+    {
+        return out << "MyClass"; (void) src;
+    }
+};
+
+struct MyBoolParamIndexedVisitor : cv::util::static_indexed_visitor<bool, MyBoolParamIndexedVisitor>
+{
+    MyBoolParamIndexedVisitor(std::ostream &output) : out(output) {}
+
+    template<class Type>
+    bool visit(std::size_t index, Type val, int check)
+    {
+        bool result = false;
+        out << index << ":" << val <<",";
+        if(std::is_same<Type, int>::value)
+        {
+            result = !memcmp(&val, &check, sizeof(int));
+        }
+        return result;
+    }
+
+    std::ostream &out;
+};
+
+struct MyBoolNoParamNonIndexedVisitor : cv::util::static_indexed_visitor<bool, MyBoolNoParamNonIndexedVisitor>
+{
+    MyBoolNoParamNonIndexedVisitor(std::ostream &output) : out(output) {}
+
+    template<class Type>
+    bool visit(std::size_t index, Type val)
+    {
+        out << index << ":" << val <<",";
+        return true;
+    }
+    std::ostream &out;
+};
+
+
+struct MyVoidNoParamNonIndexedVisitor : cv::util::static_visitor<void, MyVoidNoParamNonIndexedVisitor>
+{
+    MyVoidNoParamNonIndexedVisitor(std::ostream &output) : out(output) {}
+
+    template<class Type>
+    void visit(Type val)
+    {
+        out << val << ",";
+    }
+
+    std::ostream &out;
+};
+
+
+struct MyVoidNoParamIndexedVisitor : cv::util::static_indexed_visitor<void, MyVoidNoParamIndexedVisitor>
+{
+    MyVoidNoParamIndexedVisitor(std::ostream &output) : out(output) {}
+
+    template<class Type>
+    void visit(std::size_t Index, Type val)
+    {
+        out << Index << ":" << val <<",";
+    }
+
+    std::ostream &out;
+};
+}
+
+TEST(Variant, DynamicVisitor)
+{
+    using V = cv::util::variant<int, double, char, float, test_validation::MyType, test_validation::MyClass>;
+    V var{42};
+    {
+        std::stringstream ss;
+        test_validation::MyBoolParamIndexedVisitor visitor(ss);
+
+        EXPECT_TRUE(cv::util::visit(visitor, var, int{42}));
+        EXPECT_EQ(ss.str(), std::string("0:42,"));
+    }
+
+    std::stringstream ss;
+    test_validation::MyBoolNoParamNonIndexedVisitor visitor(ss);
+
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("0:42,"));
+
+    var = double{1.0};
+    EXPECT_TRUE(cv::util::visit(visitor, var));
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,"));
+
+    var = char{'a'};
+    EXPECT_TRUE(cv::util::visit(visitor, var));
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,"));
+
+    var = float{6.0};
+    EXPECT_TRUE(cv::util::visit(visitor, var));
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,"));
+
+    var = test_validation::MyType{};
+    EXPECT_TRUE(cv::util::visit(visitor, var));
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,4:MyType,"));
+
+    var = test_validation::MyClass{};
+    EXPECT_TRUE(cv::util::visit(visitor, var));
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,4:MyType,5:MyClass,"));
+}
+
+TEST(Variant, StaticVisitor)
+{
+    using V = cv::util::variant<int, double, char, float, test_validation::MyType, test_validation::MyClass>;
+    V var{42};
+    std::stringstream ss;
+    test_validation::MyVoidNoParamNonIndexedVisitor visitor(ss);
+
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,"));
+
+    var = double{1.0};
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,1,"));
+
+    var = char{'a'};
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,1,a,"));
+
+    var = float{6.0};
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,1,a,6,"));
+
+    var = test_validation::MyType{};
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,1,a,6,MyType,"));
+
+    var = test_validation::MyClass{};
+    cv::util::visit(visitor, var);
+    EXPECT_EQ(ss.str(), std::string("42,1,a,6,MyType,MyClass,"));
+}
+
+TEST(Variant, StaticIndexedVisitor)
+{
+    using V = cv::util::variant<int, double, char, float, test_validation::MyType, test_validation::MyClass>;
+    V var{42};
+
+    std::stringstream ss;
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor {ss}, var);
+    EXPECT_EQ(ss.str(), std::string("0:42,"));
+
+    var = double{1.0};
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor (ss), var);
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,"));
+
+    var = char{'a'};
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor (ss), var);
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,"));
+
+    var = float{6.0};
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor (ss), var);
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,"));
+
+    var = test_validation::MyType{};
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor (ss), var);
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,4:MyType,"));
+
+    var = test_validation::MyClass{};
+    cv::util::visit(test_validation::MyVoidNoParamIndexedVisitor (ss), var);
+    EXPECT_EQ(ss.str(), std::string("0:42,1:1,2:a,3:6,4:MyType,5:MyClass,"));
+}
+
+
+TEST(Variant, LambdaVisitor)
+{
+    using V = cv::util::variant<int, double, char, float, test_validation::MyType, test_validation::MyClass>;
+    V var{42};
+    {
+        cv::util::visit(cv::util::overload_lambdas(
+                [](int value) {
+                    EXPECT_EQ(value, 42);
+                },
+                [](double) {
+                    ADD_FAILURE() << "can't be called for `double`";
+                },
+                [](char) {
+                    ADD_FAILURE() << "can't be called for `char`";
+                },
+                [](float) {
+                    ADD_FAILURE() << "can't be called for `float`";
+                },
+                [](test_validation::MyType) {
+                    ADD_FAILURE() << "can't be called for `MyType`";
+                },
+                [](test_validation::MyClass) {
+                    ADD_FAILURE() << "can't be called for `MyClass`";
+                },
+                [](std::string) {
+                    ADD_FAILURE() << "can't be called for `std::string`, invalid type";
+                }
+                ), var);
+    }
+
+    var = 'c';
+    {
+        cv::util::visit(cv::util::overload_lambdas(
+                [](int) {
+                    ADD_FAILURE() << "can't be called for `int`";
+                },
+                [](double) {
+                    ADD_FAILURE() << "can't be called for `double`";
+                },
+                [](char value) {
+                    EXPECT_EQ(value, 'c');
+                },
+                [](float) {
+                    ADD_FAILURE() << "can't be called for `float`";
+                },
+                [](test_validation::MyType) {
+                    ADD_FAILURE() << "can't be called for `MyType`";
+                },
+                [](test_validation::MyClass) {
+                    ADD_FAILURE() << "can't be called for `MyClass`";
+                },
+                [](std::string) {
+                    ADD_FAILURE() << "can't be called for `std::string`, invalid type";
+                }
+                ), var);
+    }
+}
 } // namespace opencv_test
diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt
index b4d4b9f50384..2b630bfed80d 100644
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -131,12 +131,6 @@ elseif(WINRT)
     message(STATUS "  ${name}:   Removing 'comctl32.lib, gdi32.lib, ole32.lib, setupapi.lib'")
     message(STATUS "  ${name}:   Leaving '${HIGHGUI_LIBRARIES}'")
   endif()
-elseif(HAVE_WIN32UI)
-  set(OPENCV_HIGHGUI_BUILTIN_BACKEND "WIN32UI")
-  list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_w32.cpp)
-  if(OpenCV_ARCH STREQUAL "ARM64")
-    list(APPEND HIGHGUI_LIBRARIES "comdlg32" "advapi32")
-  endif()
 elseif(HAVE_COCOA)
   set(OPENCV_HIGHGUI_BUILTIN_BACKEND "COCOA")
   add_definitions(-DHAVE_COCOA)
@@ -144,6 +138,16 @@ elseif(HAVE_COCOA)
   list(APPEND HIGHGUI_LIBRARIES "-framework Cocoa")
 endif()
 
+if(TARGET ocv.3rdparty.win32ui)
+  if("win32ui" IN_LIST HIGHGUI_PLUGIN_LIST OR HIGHGUI_PLUGIN_LIST STREQUAL "all")
+    ocv_create_builtin_highgui_plugin(opencv_highgui_win32 ocv.3rdparty.win32ui "window_w32.cpp")
+  elseif(NOT OPENCV_HIGHGUI_BUILTIN_BACKEND)
+    set(OPENCV_HIGHGUI_BUILTIN_BACKEND "WIN32UI")
+    list(APPEND highgui_srcs ${CMAKE_CURRENT_LIST_DIR}/src/window_w32.cpp)
+    list(APPEND tgts ocv.3rdparty.win32ui)
+  endif()
+endif()
+
 if(TARGET ocv.3rdparty.gtk3 OR TARGET ocv.3rdparty.gtk2)
   if(TARGET ocv.3rdparty.gtk3 AND NOT WITH_GTK_2_X)
     set(__gtk_dependency "ocv.3rdparty.gtk3")
@@ -214,7 +218,8 @@ endif()
 if(NOT OPENCV_HIGHGUI_BUILTIN_BACKEND)
   set(OPENCV_HIGHGUI_BUILTIN_BACKEND "NONE")
 endif()
-message(STATUS "highgui: using builtin backend: ${OPENCV_HIGHGUI_BUILTIN_BACKEND}")  # FIXIT: propagate to root CMake
+message(STATUS "highgui: using builtin backend: ${OPENCV_HIGHGUI_BUILTIN_BACKEND}")
+set(OPENCV_HIGHGUI_BUILTIN_BACKEND "${OPENCV_HIGHGUI_BUILTIN_BACKEND}" PARENT_SCOPE)  # informational
 
 if(TRUE)
   # these variables are set by 'ocv_append_build_options(HIGHGUI ...)'
diff --git a/modules/highgui/cmake/detect_gtk.cmake b/modules/highgui/cmake/detect_gtk.cmake
index cdc054fad0c5..c58246ac5414 100644
--- a/modules/highgui/cmake/detect_gtk.cmake
+++ b/modules/highgui/cmake/detect_gtk.cmake
@@ -6,8 +6,6 @@ if(WITH_GTK)
     if(HAVE_GTK3)
       ocv_add_external_target(gtk3 "${GTK3_INCLUDE_DIRS}" "${GTK3_LIBRARIES}" "HAVE_GTK3;HAVE_GTK")
       set(HAVE_GTK TRUE)
-      set(HAVE_GTK3 ${HAVE_GTK3} PARENT_SCOPE)
-      set(GTK3_VERSION "${GTK3_VERSION}" PARENT_SCOPE) # informational
     endif()
   endif()
   if((PROJECT_NAME STREQUAL "OpenCV" AND HIGHGUI_ENABLE_PLUGINS) OR NOT HAVE_GTK3)
@@ -19,8 +17,6 @@ if(WITH_GTK)
       else()
         ocv_add_external_target(gtk2 "${GTK2_INCLUDE_DIRS}" "${GTK2_LIBRARIES}" "HAVE_GTK2;HAVE_GTK")
         set(HAVE_GTK TRUE)
-        set(HAVE_GTK2 ${HAVE_GTK2} PARENT_SCOPE)
-        set(GTK2_VERSION "${GTK2_VERSION}" PARENT_SCOPE) # informational
       endif()
     endif()
   endif()
@@ -29,15 +25,11 @@ if(WITH_GTK)
     message(FATAL_ERROR "gthread not found. This library is required when building with GTK support")
   else()
     ocv_add_external_target(gthread "${GTHREAD_INCLUDE_DIRS}" "${GTHREAD_LIBRARIES}" "HAVE_GTHREAD")
-    set(HAVE_GTHREAD "${HAVE_GTHREAD}" PARENT_SCOPE) # informational
-    set(GTHREAD_VERSION "${GTHREAD_VERSION}" PARENT_SCOPE) # informational
   endif()
   if((WITH_OPENGL OR HAVE_OPENGL) AND HAVE_GTK2)
     ocv_check_modules(GTKGLEXT gtkglext-1.0)
     if(HAVE_GTKGLEXT)
       ocv_add_external_target(gtkglext "${GTKGLEXT_INCLUDE_DIRS}" "${GTKGLEXT_LIBRARIES}" "HAVE_GTKGLEXT")
-      set(HAVE_GTKGLEXT "${HAVE_GTKGLEXT}" PARENT_SCOPE) # informational
-      set(GTKGLEXT_VERSION "${GTKGLEXT_VERSION}" PARENT_SCOPE) # informational
     endif()
   endif()
 elseif(HAVE_GTK)
@@ -48,9 +40,6 @@ if(WITH_OPENGL AND HAVE_GTKGLEXT)
   find_package(OpenGL QUIET)
   if(OPENGL_FOUND)
     set(HAVE_OPENGL TRUE)
-    #set(HAVE_OPENGL ${HAVE_OPENGL} PARENT_SCOPE)
     ocv_add_external_target(gtk_opengl "${OPENGL_INCLUDE_DIRS}" "${OPENGL_LIBRARIES}" "HAVE_OPENGL")
   endif()
 endif()
-
-set(HAVE_GTK ${HAVE_GTK} PARENT_SCOPE)
diff --git a/modules/highgui/cmake/detect_win32ui.cmake b/modules/highgui/cmake/detect_win32ui.cmake
new file mode 100644
index 000000000000..c5e358ffa710
--- /dev/null
+++ b/modules/highgui/cmake/detect_win32ui.cmake
@@ -0,0 +1,15 @@
+#--- Win32 UI ---
+ocv_clear_vars(HAVE_WIN32UI)
+if(WITH_WIN32UI)
+  try_compile(HAVE_WIN32UI
+    "${CMAKE_CURRENT_BINARY_DIR}"
+    "${OpenCV_SOURCE_DIR}/cmake/checks/win32uitest.cpp"
+    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=user32;gdi32")
+  if(HAVE_WIN32UI)
+    set(__libs "user32" "gdi32")
+    if(OpenCV_ARCH STREQUAL "ARM64")
+      list(APPEND __libs "comdlg32" "advapi32")
+    endif()
+    ocv_add_external_target(win32ui "" "${__libs}" "HAVE_WIN32UI")
+  endif()
+endif()
diff --git a/modules/highgui/cmake/init.cmake b/modules/highgui/cmake/init.cmake
index 3b766b3758c0..2002ff0e9d70 100644
--- a/modules/highgui/cmake/init.cmake
+++ b/modules/highgui/cmake/init.cmake
@@ -25,26 +25,18 @@ endif()
 # Detect available dependencies
 #
 
-include(FindPkgConfig)
-
-# FIXIT: stop using PARENT_SCOPE in dependencies
-if(PROJECT_NAME STREQUAL "OpenCV")
-  macro(add_backend backend_id cond_var)
-    if(${cond_var})
-      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
-    endif()
-  endmacro()
-else()
-  function(add_backend backend_id cond_var)
-    if(${cond_var})
-      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
-    endif()
-  endfunction()
+if(NOT PROJECT_NAME STREQUAL "OpenCV")
+  include(FindPkgConfig)
 endif()
 
-add_backend("gtk" WITH_GTK)
+macro(add_backend backend_id cond_var)
+  if(${cond_var})
+    include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
+  endif()
+endmacro()
 
-# TODO win32
+add_backend("gtk" WITH_GTK)
+add_backend("win32ui" WITH_WIN32UI)
 # TODO cocoa
 # TODO qt
 # TODO opengl
diff --git a/modules/highgui/src/backend.hpp b/modules/highgui/src/backend.hpp
index 14c88b238761..7c32846ce4a3 100644
--- a/modules/highgui/src/backend.hpp
+++ b/modules/highgui/src/backend.hpp
@@ -114,6 +114,10 @@ bool setUIBackend(const std::string& backendName);
 
 #ifndef BUILD_PLUGIN
 
+#ifdef HAVE_WIN32UI
+std::shared_ptr<UIBackend> createUIBackendWin32UI();
+#endif
+
 #ifdef HAVE_GTK
 std::shared_ptr<UIBackend> createUIBackendGTK();
 #endif
diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp
index 6ad5bce8b465..0d26b957ad71 100644
--- a/modules/highgui/src/precomp.hpp
+++ b/modules/highgui/src/precomp.hpp
@@ -67,7 +67,6 @@
 #include <string.h>
 #include <limits.h>
 #include <ctype.h>
-#include <assert.h>
 
 #if defined _WIN32 || defined WINCE
     #include <windows.h>
@@ -127,6 +126,13 @@ void cvSetPropTopmost_COCOA(const char* name, const bool topmost);
 double cvGetPropVsync_W32(const char* name);
 void cvSetPropVsync_W32(const char* name, const bool enabled);
 
+void setWindowTitle_W32(const cv::String& name, const cv::String& title);
+void setWindowTitle_GTK(const cv::String& name, const cv::String& title);
+void setWindowTitle_QT(const cv::String& name, const cv::String& title);
+void setWindowTitle_COCOA(const cv::String& name, const cv::String& title);
+
+int pollKey_W32();
+
 //for QT
 #if defined (HAVE_QT)
 CvRect cvGetWindowRect_QT(const char* name);
diff --git a/modules/highgui/src/registry.impl.hpp b/modules/highgui/src/registry.impl.hpp
index ccf81f928002..66693f1b07e0 100644
--- a/modules/highgui/src/registry.impl.hpp
+++ b/modules/highgui/src/registry.impl.hpp
@@ -50,6 +50,14 @@ std::vector<BackendInfo>& getBuiltinBackendsInfo()
 #elif defined(ENABLE_PLUGINS)
         DECLARE_DYNAMIC_BACKEND("QT")
 #endif
+#endif
+
+#ifdef _WIN32
+#ifdef HAVE_WIN32UI
+        DECLARE_STATIC_BACKEND("WIN32", createUIBackendWin32UI)
+#elif defined(ENABLE_PLUGINS)
+        DECLARE_DYNAMIC_BACKEND("WIN32")
+#endif
 #endif
     };
     return g_backends;
diff --git a/modules/highgui/src/window.cpp b/modules/highgui/src/window.cpp
index 56c1456a5d95..d9481de6da24 100644
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
@@ -586,6 +586,46 @@ void cv::moveWindow( const String& winname, int x, int y )
 #endif
 }
 
+void cv::setWindowTitle(const String& winname, const String& title)
+{
+    CV_TRACE_FUNCTION();
+
+    {
+        cv::AutoLock lock(cv::getWindowMutex());
+        auto window = findWindow_(winname);
+        if (window)
+        {
+            return window->setTitle(title);
+        }
+    }
+
+#if defined(OPENCV_HIGHGUI_WITHOUT_BUILTIN_BACKEND) && defined(ENABLE_PLUGINS)
+    auto backend = getCurrentUIBackend();
+    if (backend)
+    {
+        CV_LOG_WARNING(NULL, "Can't find window with name: '" << winname << "'. Do nothing");
+        CV_NOT_FOUND_DEPRECATION;
+    }
+    else
+    {
+        CV_LOG_WARNING(NULL, "No UI backends available. Use OPENCV_LOG_LEVEL=DEBUG for investigation");
+    }
+    return;
+#elif defined(HAVE_WIN32UI)
+    return setWindowTitle_W32(winname, title);
+#elif defined (HAVE_GTK)
+    return setWindowTitle_GTK(winname, title);
+#elif defined (HAVE_QT)
+    return setWindowTitle_QT(winname, title);
+#elif defined (HAVE_COCOA)
+    return setWindowTitle_COCOA(winname, title);
+#else
+    CV_Error(Error::StsNotImplemented, "The function is not implemented. "
+        "Rebuild the library with Windows, GTK+ 2.x or Cocoa support. "
+        "If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script");
+#endif
+}
+
 void cv::setWindowProperty(const String& winname, int prop_id, double prop_value)
 {
     CV_TRACE_FUNCTION();
@@ -630,9 +670,9 @@ int cv::waitKey(int delay)
     return (code != -1) ? (code & 0xff) : -1;
 }
 
-#if defined(HAVE_QT) || (defined (WINRT) && !defined (WINRT_8_0)) || \
-    !defined(HAVE_WIN32UI) && (defined(HAVE_GTK) || defined(HAVE_COCOA))
-// pollKey() fallback implementation
+/*
+ * process until queue is empty but don't wait.
+ */
 int cv::pollKey()
 {
     CV_TRACE_FUNCTION();
@@ -646,12 +686,13 @@ int cv::pollKey()
         }
     }
 
+#if defined(HAVE_WIN32UI)
+    return pollKey_W32();
+#else
     // fallback. please implement a proper polling function
     return cvWaitKey(1);
-}
-#elif defined(HAVE_WIN32UI)
-// pollKey() implemented in window_w32.cpp
 #endif
+}
 
 int cv::createTrackbar(const String& trackbarName, const String& winName,
                    int* value, int count, TrackbarCallback callback,
@@ -949,7 +990,7 @@ void cv::imshow( const String& winname, InputArray _img )
         auto backend = getCurrentUIBackend();
         if (backend)
         {
-            auto window = backend->createWindow(winname, WINDOW_NORMAL);
+            auto window = backend->createWindow(winname, WINDOW_AUTOSIZE);
             if (!window)
             {
                 CV_LOG_ERROR(NULL, "OpenCV/UI: Can't create window: '" << winname << "'");
@@ -1203,13 +1244,6 @@ int cv::createButton(const String&, ButtonCallback, void*, int , bool )
 // version with a more capable one without a need to recompile dependent
 // applications or libraries.
 
-void cv::setWindowTitle(const String&, const String&)
-{
-    CV_Error(Error::StsNotImplemented, "The function is not implemented. "
-        "Rebuild the library with Windows, GTK+ 2.x or Cocoa support. "
-        "If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script");
-}
-
 #define CV_NO_GUI_ERROR(funcname) \
     cv::error(cv::Error::StsError, \
     "The function is not implemented. " \
@@ -1360,11 +1394,6 @@ CV_IMPL int cvCreateButton(const char*, void (*)(int, void*), void*, int, int)
     CV_NO_GUI_ERROR("cvCreateButton");
 }
 
-int cv::pollKey()
-{
-    CV_NO_GUI_ERROR("cv::pollKey()");
-}
-
 #endif
 
 /* End of file. */
diff --git a/modules/highgui/src/window_QT.cpp b/modules/highgui/src/window_QT.cpp
index 60d7d69a5979..a81814bb79b1 100644
--- a/modules/highgui/src/window_QT.cpp
+++ b/modules/highgui/src/window_QT.cpp
@@ -63,6 +63,7 @@
     #endif
 #endif
 
+using namespace cv;
 
 //Static and global first
 static GuiReceiver *guiMainThread = NULL;
@@ -197,7 +198,7 @@ void cvSetPropWindow_QT(const char* name,double prop_value)
         Q_ARG(double, prop_value));
 }
 
-void cv::setWindowTitle(const String& winname, const String& title)
+void setWindowTitle_QT(const String& winname, const String& title)
 {
     if (!guiMainThread)
         CV_Error(Error::StsNullPtr, "NULL guiReceiver (please create a window)");
@@ -2882,18 +2883,19 @@ inline bool DefaultViewPort::isSameSize(IplImage* img1, IplImage* img2)
 void DefaultViewPort::controlImagePosition()
 {
     qreal left, top, right, bottom;
+    qreal factor = 1.0 / param_matrixWorld.m11();
 
     //after check top-left, bottom right corner to avoid getting "out" during zoom/panning
     param_matrixWorld.map(0,0,&left,&top);
 
     if (left > 0)
     {
-        param_matrixWorld.translate(-left,0);
+        param_matrixWorld.translate(-left * factor, 0);
         left = 0;
     }
     if (top > 0)
     {
-        param_matrixWorld.translate(0,-top);
+        param_matrixWorld.translate(0, -top * factor);
         top = 0;
     }
     //-------
@@ -2902,12 +2904,12 @@ void DefaultViewPort::controlImagePosition()
     param_matrixWorld.map(sizeImage.width(),sizeImage.height(),&right,&bottom);
     if (right < sizeImage.width())
     {
-        param_matrixWorld.translate(sizeImage.width()-right,0);
+        param_matrixWorld.translate((sizeImage.width() - right) * factor, 0);
         right = sizeImage.width();
     }
     if (bottom < sizeImage.height())
     {
-        param_matrixWorld.translate(0,sizeImage.height()-bottom);
+        param_matrixWorld.translate(0, (sizeImage.height() - bottom) * factor);
         bottom = sizeImage.height();
     }
 
diff --git a/modules/highgui/src/window_cocoa.mm b/modules/highgui/src/window_cocoa.mm
index 29a0278c982e..e8e903440675 100644
--- a/modules/highgui/src/window_cocoa.mm
+++ b/modules/highgui/src/window_cocoa.mm
@@ -795,18 +795,18 @@ void cvSetPropTopmost_COCOA( const char* name, const bool topmost )
     __END__;
 }
 
-void cv::setWindowTitle(const String& winname, const String& title)
+void setWindowTitle_COCOA(const cv::String& winname, const cv::String& title)
 {
     CVWindow *window = cvGetWindow(winname.c_str());
 
     if (window == NULL)
     {
-        namedWindow(winname);
+        cv::namedWindow(winname);
         window = cvGetWindow(winname.c_str());
     }
 
     if (window == NULL)
-        CV_Error(Error::StsNullPtr, "NULL window");
+        CV_Error(cv::Error::StsNullPtr, "NULL window");
 
     NSAutoreleasePool* localpool = [[NSAutoreleasePool alloc] init];
 
diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp
index efa3fbd96f56..3428586ea344 100644
--- a/modules/highgui/src/window_gtk.cpp
+++ b/modules/highgui/src/window_gtk.cpp
@@ -364,7 +364,7 @@ static void cvImageWidget_set_size(GtkWidget * widget, int max_width, int max_he
 
 
     }
-    assert( image_widget->scaled_image );
+    CV_Assert(image_widget->scaled_image);
 }
 
 static void
@@ -849,7 +849,7 @@ static bool setModeWindow_(const std::shared_ptr<CvWindow>& window, int mode)
     return false;
 }
 
-void cv::setWindowTitle(const String& winname, const String& title)
+void setWindowTitle_GTK(const String& winname, const String& title)
 {
     CV_LOCK_MUTEX();
 
@@ -2023,6 +2023,7 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
                (unsigned)pt.y < (unsigned)(image_widget->original_image->height)
             ))
         {
+            state &= gtk_accelerator_get_default_mod_mask();
             flags |= BIT_MAP(state, GDK_SHIFT_MASK,   CV_EVENT_FLAG_SHIFTKEY) |
                 BIT_MAP(state, GDK_CONTROL_MASK, CV_EVENT_FLAG_CTRLKEY)  |
                 BIT_MAP(state, GDK_MOD1_MASK,    CV_EVENT_FLAG_ALTKEY)   |
diff --git a/modules/highgui/src/window_w32.cpp b/modules/highgui/src/window_w32.cpp
index c4f2ddd2a603..716af1094c29 100644
--- a/modules/highgui/src/window_w32.cpp
+++ b/modules/highgui/src/window_w32.cpp
@@ -41,12 +41,17 @@
 
 #include "precomp.hpp"
 
+#ifdef HAVE_WIN32UI
+
+#include <opencv2/core/utils/logger.hpp>
+#include <opencv2/core/utils/trace.hpp>
+
+#include "backend.hpp"
+
 using namespace cv;
 
 #include <windowsx.h> // required for GET_X_LPARAM() and GET_Y_LPARAM() macros
 
-#if defined _WIN32
-
 #ifdef __GNUC__
 #  pragma GCC diagnostic ignored "-Wmissing-declarations"
 #endif
@@ -60,14 +65,12 @@ using namespace cv;
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-#include <assert.h>
 
 #ifdef HAVE_OPENGL
 #include <memory>
 #include <algorithm>
 #include <vector>
 #include <functional>
-#include "opencv2/highgui.hpp"
 #include <GL/gl.h>
 #include "opencv2/core/opengl.hpp"
 #endif
@@ -78,7 +81,7 @@ static const char* trackbar_text =
 #if defined _M_X64 || defined __x86_64 || defined _M_ARM64
 
 #define icvGetWindowLongPtr GetWindowLongPtr
-#define icvSetWindowLongPtr( hwnd, id, ptr ) SetWindowLongPtr( hwnd, id, (LONG_PTR)(ptr) )
+#define icvSetWindowLongPtr(hwnd, id, ptr) SetWindowLongPtr(hwnd, id, (LONG_PTR)(ptr))
 #define icvGetClassLongPtr  GetClassLongPtr
 
 #define CV_USERDATA GWLP_USERDATA
@@ -89,7 +92,7 @@ static const char* trackbar_text =
 #else
 
 #define icvGetWindowLongPtr GetWindowLong
-#define icvSetWindowLongPtr( hwnd, id, ptr ) SetWindowLong( hwnd, id, (size_t)ptr )
+#define icvSetWindowLongPtr(hwnd, id, ptr) SetWindowLong(hwnd, id, (size_t)ptr)
 #define icvGetClassLongPtr GetClassLong
 
 #define CV_USERDATA GWL_USERDATA
@@ -116,13 +119,13 @@ static inline void mingw_strcat_s(char *dest, size_t destsz, const char *src){
 #define strcat_s mingw_strcat_s
 #endif
 
-static void FillBitmapInfo( BITMAPINFO* bmi, int width, int height, int bpp, int origin )
+static void FillBitmapInfo(BITMAPINFO* bmi, int width, int height, int bpp, int origin)
 {
-    assert( bmi && width >= 0 && height >= 0 && (bpp == 8 || bpp == 24 || bpp == 32));
+    CV_Assert(bmi && width >= 0 && height >= 0 && (bpp == 8 || bpp == 24 || bpp == 32));
 
     BITMAPINFOHEADER* bmih = &(bmi->bmiHeader);
 
-    memset( bmih, 0, sizeof(*bmih));
+    memset(bmih, 0, sizeof(*bmih));
     bmih->biSize = sizeof(BITMAPINFOHEADER);
     bmih->biWidth = width;
     bmih->biHeight = origin ? abs(height) : -abs(height);
@@ -130,11 +133,11 @@ static void FillBitmapInfo( BITMAPINFO* bmi, int width, int height, int bpp, int
     bmih->biBitCount = (unsigned short)bpp;
     bmih->biCompression = BI_RGB;
 
-    if( bpp == 8 )
+    if (bpp == 8)
     {
         RGBQUAD* palette = bmi->bmiColors;
         int i;
-        for( i = 0; i < 256; i++ )
+        for (i = 0; i < 256; i++)
         {
             palette[i].rgbBlue = palette[i].rgbGreen = palette[i].rgbRed = (BYTE)i;
             palette[i].rgbReserved = 0;
@@ -144,68 +147,91 @@ static void FillBitmapInfo( BITMAPINFO* bmi, int width, int height, int bpp, int
 
 struct CvWindow;
 
-typedef struct CvTrackbar
+struct CvTrackbar : public std::enable_shared_from_this<CvTrackbar>
 {
+    CvTrackbar(CvWindow& window, const std::string& name_)
+        : signature(CV_TRACKBAR_MAGIC_VAL)
+        , name(name_)
+        , parent(&window)
+    {
+        // nothing
+    }
+    ~CvTrackbar()
+    {
+        signature = -1;
+    }
+
     int signature;
-    HWND hwnd;
-    char* name;
-    CvTrackbar* next;
-    CvWindow* parent;
-    HWND buddy;
-    int* data;
-    int pos;
-    int maxval;
-    int minval;
-    void (*notify)(int);
-    void (*notify2)(int, void*);
-    void* userdata;
-    int id;
-}
-CvTrackbar;
+    HWND hwnd = 0;
+    std::string name;
+    CvWindow* parent;  // TODO weak_ptr
+    HWND buddy = 0;
+    int* data = nullptr;
+    int pos = 0;
+    int maxval = 0;
+    int minval = 0;
+    void (*notify)(int) = nullptr;  // deprecated
+    void (*notify2)(int, void*) = nullptr;  // deprecated
+    TrackbarCallback onChangeCallback = nullptr;
+    void* userdata = nullptr;
+    int id = -1;
+};
 
 
-typedef struct CvWindow
+struct CvWindow : public std::enable_shared_from_this<CvWindow>
 {
+    CvWindow(const std::string& name_)
+        : signature(CV_WINDOW_MAGIC_VAL)
+        , name(name_)
+    {
+        // nothing
+    }
+
+    ~CvWindow()
+    {
+        signature = -1;
+    }
+
+    void destroy();
+
     int signature;
-    HWND hwnd;
-    char* name;
-    CvWindow* prev;
-    CvWindow* next;
-    HWND frame;
+    cv::Mutex mutex;
+    HWND hwnd = 0;
+    std::string name;
+    HWND frame = 0;
 
-    HDC dc;
-    HGDIOBJ image;
-    int last_key;
-    int flags;
-    int status;//0 normal, 1 fullscreen (YV)
+    HDC dc = 0;
+    HGDIOBJ image = 0;
+    int last_key = 0;
+    int flags = 0;
+    int status = 0;//0 normal, 1 fullscreen (YV)
 
-    CvMouseCallback on_mouse;
-    void* on_mouse_param;
+    CvMouseCallback on_mouse = nullptr;
+    void* on_mouse_param = nullptr;
 
     struct
     {
-        HWND toolbar;
-        int pos;
-        int rows;
-        WNDPROC toolBarProc;
-        CvTrackbar* first;
+        HWND toolbar = 0;
+        int pos = 0;
+        int rows = 0;
+        WNDPROC toolBarProc = nullptr;
+        std::vector< std::shared_ptr<CvTrackbar> > trackbars;
     }
     toolbar;
 
-    int width;
-    int height;
+    int width = -1;
+    int height = -1;
 
     // OpenGL support
 
 #ifdef HAVE_OPENGL
-    bool useGl;
-    HGLRC hGLRC;
+    bool useGl = false;
+    HGLRC hGLRC = 0;
 
-    CvOpenGlDrawCallback glDrawCallback;
-    void* glDrawData;
+    CvOpenGlDrawCallback glDrawCallback = nullptr;
+    void* glDrawData = nullptr;
 #endif
-}
-CvWindow;
+};
 
 #define HG_BUDDY_WIDTH  130
 
@@ -221,19 +247,50 @@ CvWindow;
     #define TBM_GETTOOLTIPS  (WM_USER + 30)
 #endif
 
-static LRESULT CALLBACK HighGUIProc(  HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
-static LRESULT CALLBACK WindowProc(  HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
-static LRESULT CALLBACK MainWindowProc(  HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
-static void icvUpdateWindowPos( CvWindow* window );
+static
+std::vector< std::shared_ptr<CvWindow> >& getWindowsList()
+{
+    static std::vector< std::shared_ptr<CvWindow> > g_windows;
+    return g_windows;
+}
+
 
-static CvWindow* hg_windows = 0;
+// Mutex must be locked
+static
+std::shared_ptr<CvWindow> icvFindWindowByName(const std::string& name)
+{
+    auto& g_windows = getWindowsList();
+    for (auto it = g_windows.begin(); it != g_windows.end(); ++it)
+    {
+        auto window = *it;
+        if (!window)
+            continue;
+        if (window->name == name)
+            return window;
+    }
+    return std::shared_ptr<CvWindow>();
+}
+
+static inline
+std::shared_ptr<CvWindow> icvFindWindowByName(const char* name)
+{
+    CV_Assert(name);
+    return icvFindWindowByName(std::string(name));
+}
+
+
+
+static LRESULT CALLBACK HighGUIProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
+static LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
+static LRESULT CALLBACK MainWindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
+static void icvUpdateWindowPos(CvWindow& window);
 
 typedef int (CV_CDECL * CvWin32WindowCallback)(HWND, UINT, WPARAM, LPARAM, int*);
 static CvWin32WindowCallback hg_on_preprocess = 0, hg_on_postprocess = 0;
 static HINSTANCE hg_hinstance = 0;
 
-static const char* highGUIclassName = "HighGUI class";
-static const char* mainHighGUIclassName = "Main HighGUI class";
+static const char* const highGUIclassName = "HighGUI class";
+static const char* const mainHighGUIclassName = "Main HighGUI class";
 
 static void icvCleanupHighgui()
 {
@@ -242,15 +299,15 @@ static void icvCleanupHighgui()
     UnregisterClass(mainHighGUIclassName, hg_hinstance);
 }
 
-CV_IMPL int cvInitSystem( int, char** )
+CV_IMPL int cvInitSystem(int, char**)
 {
     static int wasInitialized = 0;
 
     // check initialization status
-    if( !wasInitialized )
+    if (!wasInitialized)
     {
-        // Initialize the storage
-        hg_windows = 0;
+        (void)getWindowMutex();  // force mutex initialization
+        (void)getWindowsList();  // Initialize the storage
 
         // Register the class
         WNDCLASS wndc;
@@ -262,7 +319,7 @@ CV_IMPL int cvInitSystem( int, char** )
         wndc.lpszClassName = highGUIclassName;
         wndc.lpszMenuName = highGUIclassName;
         wndc.hIcon = LoadIcon(0, IDI_APPLICATION);
-        wndc.hCursor = (HCURSOR)LoadCursor(0, (LPSTR)(size_t)IDC_CROSS );
+        wndc.hCursor = (HCURSOR)LoadCursor(0, (LPSTR)(size_t)IDC_CROSS);
         wndc.hbrBackground = (HBRUSH)GetStockObject(DKGRAY_BRUSH);
 
         RegisterClass(&wndc);
@@ -273,12 +330,12 @@ CV_IMPL int cvInitSystem( int, char** )
         wndc.lpfnWndProc = MainWindowProc;
 
         RegisterClass(&wndc);
-        atexit( icvCleanupHighgui );
+        atexit(icvCleanupHighgui);
 
         wasInitialized = 1;
     }
 
-    setlocale(LC_NUMERIC,"C");
+    setlocale(LC_NUMERIC,"C");  // FIXIT must be removed
 
     return 0;
 }
@@ -287,50 +344,58 @@ CV_IMPL int cvStartWindowThread(){
     return 0;
 }
 
-static CvWindow* icvFindWindowByName( const char* name )
-{
-    CvWindow* window = hg_windows;
-
-    for( ; window != 0 && strcmp( name, window->name) != 0; window = window->next )
-        ;
-
-    return window;
-}
-
 
-static CvWindow* icvWindowByHWND( HWND hwnd )
+static std::shared_ptr<CvWindow> icvWindowByHWND(HWND hwnd)
 {
-    CvWindow* window = (CvWindow*)icvGetWindowLongPtr( hwnd, CV_USERDATA );
-    return window != 0 && hg_windows != 0 &&
+    AutoLock lock(getWindowMutex());
+    CvWindow* window = (CvWindow*)icvGetWindowLongPtr(hwnd, CV_USERDATA);
+    window = window != 0 &&
            window->signature == CV_WINDOW_MAGIC_VAL ? window : 0;
+    if (window)
+    {
+        return window->shared_from_this();
+    }
+    else
+    {
+        return std::shared_ptr<CvWindow>();
+    }
 }
 
 
-static CvTrackbar* icvTrackbarByHWND( HWND hwnd )
+static std::shared_ptr<CvTrackbar> icvTrackbarByHWND(HWND hwnd)
 {
-    CvTrackbar* trackbar = (CvTrackbar*)icvGetWindowLongPtr( hwnd, CV_USERDATA );
-    return trackbar != 0 && trackbar->signature == CV_TRACKBAR_MAGIC_VAL &&
+    AutoLock lock(getWindowMutex());
+    CvTrackbar* trackbar = (CvTrackbar*)icvGetWindowLongPtr(hwnd, CV_USERDATA);
+    trackbar = trackbar != 0 && trackbar->signature == CV_TRACKBAR_MAGIC_VAL &&
            trackbar->hwnd == hwnd ? trackbar : 0;
+    if (trackbar)
+    {
+        return trackbar->shared_from_this();
+    }
+    else
+    {
+        return std::shared_ptr<CvTrackbar>();
+    }
 }
 
 
-static const char* icvWindowPosRootKey = "Software\\OpenCV\\HighGUI\\Windows\\";
+static const char* const icvWindowPosRootKey = "Software\\OpenCV\\HighGUI\\Windows\\";
 
 // Window positions saving/loading added by Philip Gruebele.
 //<a href="mailto:pgruebele@cox.net">pgruebele@cox.net</a>
 // Restores the window position from the registry saved position.
 static void
-icvLoadWindowPos( const char* name, CvRect& rect )
+icvLoadWindowPos(const char* name, CvRect& rect)
 {
     HKEY hkey;
     char szKey[1024];
-    strcpy_s( szKey, 1024, icvWindowPosRootKey );
-    strcat_s( szKey, 1024, name );
+    strcpy_s(szKey, 1024, icvWindowPosRootKey);
+    strcat_s(szKey, 1024, name);
 
     rect.x = rect.y = CW_USEDEFAULT;
     rect.width = rect.height = 320;
 
-    if( RegOpenKeyEx(HKEY_CURRENT_USER,szKey,0,KEY_QUERY_VALUE,&hkey) == ERROR_SUCCESS )
+    if (RegOpenKeyEx(HKEY_CURRENT_USER,szKey,0,KEY_QUERY_VALUE,&hkey) == ERROR_SUCCESS)
     {
         // Yes we are installed.
         DWORD dwType = 0;
@@ -379,16 +444,16 @@ icvLoadWindowPos( const char* name, CvRect& rect )
 //<a href="mailto:pgruebele@cox.net">pgruebele@cox.net</a>
 // philipg.  Saves the window position in the registry
 static void
-icvSaveWindowPos( const char* name, CvRect rect )
+icvSaveWindowPos(const char* name, CvRect rect)
 {
     static const DWORD MAX_RECORD_COUNT = 100;
     HKEY hkey;
     char szKey[1024];
     char rootKey[1024];
-    strcpy_s( szKey, 1024, icvWindowPosRootKey );
-    strcat_s( szKey, 1024, name );
+    strcpy_s(szKey, 1024, icvWindowPosRootKey);
+    strcat_s(szKey, 1024, name);
 
-    if( RegOpenKeyEx( HKEY_CURRENT_USER,szKey,0,KEY_READ,&hkey) != ERROR_SUCCESS )
+    if (RegOpenKeyEx(HKEY_CURRENT_USER,szKey,0,KEY_READ,&hkey) != ERROR_SUCCESS)
     {
         HKEY hroot;
         DWORD count = 0;
@@ -396,40 +461,40 @@ icvSaveWindowPos( const char* name, CvRect rect )
         char oldestKey[1024];
         char currentKey[1024];
 
-        strcpy_s( rootKey, 1024, icvWindowPosRootKey );
+        strcpy_s(rootKey, 1024, icvWindowPosRootKey);
         rootKey[strlen(rootKey)-1] = '\0';
-        if( RegCreateKeyEx(HKEY_CURRENT_USER, rootKey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_READ+KEY_WRITE, 0, &hroot, NULL) != ERROR_SUCCESS )
-            //RegOpenKeyEx( HKEY_CURRENT_USER,rootKey,0,KEY_READ,&hroot) != ERROR_SUCCESS )
+        if (RegCreateKeyEx(HKEY_CURRENT_USER, rootKey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_READ+KEY_WRITE, 0, &hroot, NULL) != ERROR_SUCCESS)
+            //RegOpenKeyEx(HKEY_CURRENT_USER,rootKey,0,KEY_READ,&hroot) != ERROR_SUCCESS)
             return;
 
         for(;;)
         {
             DWORD csize = sizeof(currentKey);
             FILETIME accesstime = { 0, 0 };
-            LONG code = RegEnumKeyEx( hroot, count, currentKey, &csize, NULL, NULL, NULL, &accesstime );
-            if( code != ERROR_SUCCESS && code != ERROR_MORE_DATA )
+            LONG code = RegEnumKeyEx(hroot, count, currentKey, &csize, NULL, NULL, NULL, &accesstime);
+            if (code != ERROR_SUCCESS && code != ERROR_MORE_DATA)
                 break;
             count++;
-            if( oldestTime.dwHighDateTime > accesstime.dwHighDateTime ||
+            if (oldestTime.dwHighDateTime > accesstime.dwHighDateTime ||
                 (oldestTime.dwHighDateTime == accesstime.dwHighDateTime &&
-                oldestTime.dwLowDateTime > accesstime.dwLowDateTime) )
+                oldestTime.dwLowDateTime > accesstime.dwLowDateTime))
             {
                 oldestTime = accesstime;
-                strcpy_s( oldestKey, 1024, currentKey );
+                strcpy_s(oldestKey, 1024, currentKey);
             }
         }
 
-        if( count >= MAX_RECORD_COUNT )
-            RegDeleteKey( hroot, oldestKey );
-        RegCloseKey( hroot );
+        if (count >= MAX_RECORD_COUNT)
+            RegDeleteKey(hroot, oldestKey);
+        RegCloseKey(hroot);
 
-        if( RegCreateKeyEx(HKEY_CURRENT_USER,szKey,0,NULL,REG_OPTION_NON_VOLATILE, KEY_WRITE, 0, &hkey, NULL) != ERROR_SUCCESS )
+        if (RegCreateKeyEx(HKEY_CURRENT_USER,szKey,0,NULL,REG_OPTION_NON_VOLATILE, KEY_WRITE, 0, &hkey, NULL) != ERROR_SUCCESS)
             return;
     }
     else
     {
-        RegCloseKey( hkey );
-        if( RegOpenKeyEx( HKEY_CURRENT_USER,szKey,0,KEY_WRITE,&hkey) != ERROR_SUCCESS )
+        RegCloseKey(hkey);
+        if (RegOpenKeyEx(HKEY_CURRENT_USER,szKey,0,KEY_WRITE,&hkey) != ERROR_SUCCESS)
             return;
     }
 
@@ -440,96 +505,101 @@ icvSaveWindowPos( const char* name, CvRect rect )
     RegCloseKey(hkey);
 }
 
+static Rect getImageRect_(CvWindow& window);
+
 CvRect cvGetWindowRect_W32(const char* name)
 {
-    RECT rect = { 0 };
-    CvRect result = cvRect(-1, -1, -1, -1);
-
-    CV_FUNCNAME( "cvGetWindowRect_W32" );
+    CV_FUNCNAME("cvGetWindowRect_W32");
 
-    __BEGIN__;
-
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
-    window = icvFindWindowByName( name );
+        CV_Error(Error::StsNullPtr, "NULL name string");
+
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT; // keep silence here
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    GetClientRect(window->hwnd, &rect);
-    {
+    Rect r = getImageRect_(*window);
+
+    CvRect result = cvRect(r.x, r.y, r.width, r.height);
+    return result;
+}
+
+static Rect getImageRect_(CvWindow& window)
+{
+    RECT rect = { 0 };
+    GetClientRect(window.hwnd, &rect);
     POINT pt = {rect.left, rect.top};
-    ClientToScreen(window->hwnd, &pt);
-    result = cvRect(pt.x, pt.y, rect.right - rect.left, rect.bottom - rect.top);
-    }
-    __END__;
+    ClientToScreen(window.hwnd, &pt);
+    Rect result(pt.x, pt.y, rect.right - rect.left, rect.bottom - rect.top);
     return result;
 }
 
 double cvGetModeWindow_W32(const char* name)//YV
 {
-    double result = -1;
+    CV_FUNCNAME("cvGetModeWindow_W32");
 
-    CV_FUNCNAME( "cvGetModeWindow_W32" );
-
-    __BEGIN__;
-
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT; // keep silence here
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    result = window->status;
-
-    __END__;
-    return result;
+    return window->status;
 }
 
-void cvSetModeWindow_W32( const char* name, double prop_value)//Yannick Verdie
+static bool setModeWindow_(CvWindow& window, int mode);
+
+void cvSetModeWindow_W32(const char* name, double prop_value)//Yannick Verdie
 {
-    CV_FUNCNAME( "cvSetModeWindow_W32" );
+    CV_FUNCNAME("cvSetModeWindow_W32");
+
+    AutoLock lock(getWindowMutex());
 
-    __BEGIN__;
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    CvWindow* window;
+    auto window = icvFindWindowByName(name);
+    if (!window)
+         CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    if(!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+    (void)setModeWindow_(*window, (int)prop_value);
+}
 
-    window = icvFindWindowByName( name );
-    if( !window )
-        CV_ERROR( CV_StsNullPtr, "NULL window" );
+static bool setModeWindow_(CvWindow& window, int mode)
+{
+    if (window.flags & CV_WINDOW_AUTOSIZE)//if the flag CV_WINDOW_AUTOSIZE is set
+        return false;
 
-    if(window->flags & CV_WINDOW_AUTOSIZE)//if the flag CV_WINDOW_AUTOSIZE is set
-        EXIT;
+    if (window.status == mode)
+        return true;
 
     {
-        DWORD dwStyle = (DWORD)GetWindowLongPtr(window->frame, GWL_STYLE);
+        DWORD dwStyle = (DWORD)GetWindowLongPtr(window.frame, GWL_STYLE);
         CvRect position;
 
-        if (window->status==CV_WINDOW_FULLSCREEN && prop_value==CV_WINDOW_NORMAL)
+        if (window.status == CV_WINDOW_FULLSCREEN && mode == CV_WINDOW_NORMAL)
         {
-            icvLoadWindowPos(window->name,position );
-            SetWindowLongPtr(window->frame, GWL_STYLE, dwStyle | WS_CAPTION | WS_THICKFRAME);
+            icvLoadWindowPos(window.name.c_str(), position);
+            SetWindowLongPtr(window.frame, GWL_STYLE, dwStyle | WS_CAPTION | WS_THICKFRAME);
 
-            SetWindowPos(window->frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
-            window->status=CV_WINDOW_NORMAL;
+            SetWindowPos(window.frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
+            window.status=CV_WINDOW_NORMAL;
 
-            EXIT;
+            return true;
         }
 
-        if (window->status==CV_WINDOW_NORMAL && prop_value==CV_WINDOW_FULLSCREEN)
+        if (window.status == CV_WINDOW_NORMAL && mode == CV_WINDOW_FULLSCREEN)
         {
             //save dimension
             RECT rect = { 0 };
-            GetWindowRect(window->frame, &rect);
-            CvRect RectCV = cvRect(rect.left, rect.top,rect.right - rect.left, rect.bottom - rect.top);
-            icvSaveWindowPos(window->name,RectCV );
+            GetWindowRect(window.frame, &rect);
+            CvRect rectCV = cvRect(rect.left, rect.top,rect.right - rect.left, rect.bottom - rect.top);
+            icvSaveWindowPos(window.name.c_str(), rectCV);
 
             //Look at coordinate for fullscreen
             HMONITOR hMonitor;
@@ -542,60 +612,75 @@ void cvSetModeWindow_W32( const char* name, double prop_value)//Yannick Verdie
             //fullscreen
             position.x=mi.rcMonitor.left;position.y=mi.rcMonitor.top;
             position.width=mi.rcMonitor.right - mi.rcMonitor.left;position.height=mi.rcMonitor.bottom - mi.rcMonitor.top;
-            SetWindowLongPtr(window->frame, GWL_STYLE, dwStyle & ~WS_CAPTION & ~WS_THICKFRAME);
+            SetWindowLongPtr(window.frame, GWL_STYLE, dwStyle & ~WS_CAPTION & ~WS_THICKFRAME);
 
-            SetWindowPos(window->frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
-            window->status=CV_WINDOW_FULLSCREEN;
+            SetWindowPos(window.frame, HWND_TOP, position.x, position.y , position.width,position.height, SWP_NOZORDER | SWP_FRAMECHANGED);
+            window.status=CV_WINDOW_FULLSCREEN;
 
-            EXIT;
+            return true;
         }
     }
 
-    __END__;
+    return false;
 }
 
+static double getPropTopmost_(CvWindow& window);
+
 double cvGetPropTopmost_W32(const char* name)
 {
-    double result = -1;
-
     CV_Assert(name);
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error(Error::StsNullPtr, "NULL window");
 
-    LONG style = GetWindowLongA(window->frame, GWL_EXSTYLE); // -20
+    return getPropTopmost_(*window);
+}
+
+static double getPropTopmost_(CvWindow& window)
+{
+    LONG style = GetWindowLongA(window.frame, GWL_EXSTYLE); // -20
     if (!style)
     {
         std::ostringstream errorMsg;
-        errorMsg << "window(" << name << "): failed to retrieve extended window style using GetWindowLongA(); error code: " << GetLastError();
-        CV_Error(Error::StsError, errorMsg.str().c_str());
+        errorMsg << "window(" << window.name << "): failed to retrieve extended window style using GetWindowLongA(); error code: " << GetLastError();
+        CV_Error(Error::StsError, errorMsg.str());
     }
 
-    result = (style & WS_EX_TOPMOST) == WS_EX_TOPMOST;
-
-    return result;
+    bool result = (style & WS_EX_TOPMOST) == WS_EX_TOPMOST;
+    return result ? 1.0 : 0.0;
 }
 
+static bool setPropTopmost_(CvWindow& window, bool topmost);
+
 void cvSetPropTopmost_W32(const char* name, const bool topmost)
 {
     CV_Assert(name);
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error(Error::StsNullPtr, "NULL window");
 
+    (void)setPropTopmost_(*window, topmost);
+}
+
+static bool setPropTopmost_(CvWindow& window, bool topmost)
+{
     HWND flag    = topmost ? HWND_TOPMOST : HWND_TOP;
-    BOOL success = SetWindowPos(window->frame, flag, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE);
+    BOOL success = SetWindowPos(window.frame, flag, 0, 0, 0, 0, SWP_NOMOVE | SWP_NOSIZE);
 
     if (!success)
     {
         std::ostringstream errorMsg;
-        errorMsg << "window(" << name << "): error reported by SetWindowPos(" << (topmost ? "HWND_TOPMOST" : "HWND_TOP") << "), error code:  " << GetLastError();
-        CV_Error(Error::StsError, errorMsg.str().c_str());
+        errorMsg << "window(" << window.name << "): error reported by SetWindowPos(" << (topmost ? "HWND_TOPMOST" : "HWND_TOP") << "), error code:  " << GetLastError();
+        CV_Error(Error::StsError, errorMsg.str());
+        return false;
     }
+    return true;
 }
 
+static double getPropVsync_(CvWindow& window);
+
 double cvGetPropVsync_W32(const char* name)
 {
 #ifndef HAVE_OPENGL
@@ -605,40 +690,53 @@ double cvGetPropVsync_W32(const char* name)
     if (!name)
         CV_Error(Error::StsNullPtr, "'name' argument must not be NULL");
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error_(Error::StsBadArg, ("there is no window named '%s'", name));
 
+    double result = getPropVsync_(*window);
+    return cvIsNaN(result) ? -1.0 : result;
+#endif
+}
+
+static double getPropVsync_(CvWindow& window)
+{
+#ifndef HAVE_OPENGL
+    CV_UNUSED(window);
+    CV_Error(Error::OpenGlNotSupported, "Library was built without OpenGL support");
+#else
     // https://www.khronos.org/opengl/wiki/Swap_Interval
     // https://www.khronos.org/registry/OpenGL/extensions/EXT/WGL_EXT_extensions_string.txt
     // https://www.khronos.org/registry/OpenGL/extensions/EXT/WGL_EXT_swap_control.txt
 
-    if (!wglMakeCurrent(window->dc, window->hGLRC))
+    if (!wglMakeCurrent(window.dc, window.hGLRC))
         CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 
     typedef const char* (APIENTRY* PFNWGLGETEXTENSIONSSTRINGEXTPROC)(void);
     PFNWGLGETEXTENSIONSSTRINGEXTPROC wglGetExtensionsString = NULL;
     wglGetExtensionsString = (PFNWGLGETEXTENSIONSSTRINGEXTPROC)wglGetProcAddress("wglGetExtensionsStringEXT");
     if (wglGetExtensionsString == NULL)
-        return -1; // wglGetProcAddress failed to get wglGetExtensionsStringEXT
+        return std::numeric_limits<double>::quiet_NaN(); // wglGetProcAddress failed to get wglGetExtensionsStringEXT
 
     const char* wgl_extensions = wglGetExtensionsString();
     if (wgl_extensions == NULL)
-        return -1; // Can't get WGL extensions string
+        return std::numeric_limits<double>::quiet_NaN(); // Can't get WGL extensions string
 
     if (strstr(wgl_extensions, "WGL_EXT_swap_control") == NULL)
-        return -1; // WGL extensions don't contain WGL_EXT_swap_control
+        return std::numeric_limits<double>::quiet_NaN(); // WGL extensions don't contain WGL_EXT_swap_control
 
     typedef int (APIENTRY* PFNWGLGETSWAPINTERVALPROC)(void);
     PFNWGLGETSWAPINTERVALPROC wglGetSwapInterval = 0;
     wglGetSwapInterval = (PFNWGLGETSWAPINTERVALPROC)wglGetProcAddress("wglGetSwapIntervalEXT");
     if (wglGetSwapInterval == NULL)
-        return -1; // wglGetProcAddress failed to get wglGetSwapIntervalEXT
+        return std::numeric_limits<double>::quiet_NaN(); // wglGetProcAddress failed to get wglGetSwapIntervalEXT
 
     return wglGetSwapInterval();
 #endif
 }
 
+static bool setPropVsync_(CvWindow& window, bool enable_vsync);
+
 void cvSetPropVsync_W32(const char* name, const bool enable_vsync)
 {
 #ifndef HAVE_OPENGL
@@ -649,11 +747,22 @@ void cvSetPropVsync_W32(const char* name, const bool enable_vsync)
     if (!name)
         CV_Error(Error::StsNullPtr, "'name' argument must not be NULL");
 
-    CvWindow* window = icvFindWindowByName(name);
+    auto window = icvFindWindowByName(name);
     if (!window)
         CV_Error_(Error::StsBadArg, ("there is no window named '%s'", name));
 
-    if (!wglMakeCurrent(window->dc, window->hGLRC))
+    (void)setPropVsync_(*window, enable_vsync);
+#endif
+}
+
+static bool setPropVsync_(CvWindow& window, bool enable_vsync)
+{
+#ifndef HAVE_OPENGL
+    CV_UNUSED(window);
+    CV_UNUSED(enable_vsync);
+    CV_Error(Error::OpenGlNotSupported, "Library was built without OpenGL support");
+#else
+    if (!wglMakeCurrent(window.dc, window.hGLRC))
         CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 
     typedef const char* (APIENTRY* PFNWGLGETEXTENSIONSSTRINGEXTPROC)(void);
@@ -676,47 +785,44 @@ void cvSetPropVsync_W32(const char* name, const bool enable_vsync)
         CV_Error(Error::OpenGlApiCallError, "wglGetProcAddress failed to get wglSwapIntervalEXT");
 
     wglSwapInterval(enable_vsync);
+    return true;
 #endif
 }
 
-void cv::setWindowTitle(const String& winname, const String& title)
+void setWindowTitle_W32(const std::string& name, const std::string& title)
 {
-    CvWindow* window = icvFindWindowByName(winname.c_str());
+    auto window = icvFindWindowByName(name);
 
     if (!window)
     {
-        namedWindow(winname);
-        window = icvFindWindowByName(winname.c_str());
+        namedWindow(name);
+        window = icvFindWindowByName(name);
     }
 
     if (!window)
         CV_Error(Error::StsNullPtr, "NULL window");
 
     if (!SetWindowText(window->frame, title.c_str()))
-        CV_Error_(Error::StsError, ("Failed to set \"%s\" window title to \"%s\"", winname.c_str(), title.c_str()));
+        CV_Error_(Error::StsError, ("Failed to set \"%s\" window title to \"%s\"", name.c_str(), title.c_str()));
 }
 
 double cvGetPropWindowAutoSize_W32(const char* name)
 {
     double result = -1;
 
-    CV_FUNCNAME( "cvSetCloseCallback" );
+    CV_FUNCNAME("cvSetCloseCallback");
 
-    __BEGIN__;
-
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT; // keep silence here
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     result = window->flags & CV_WINDOW_AUTOSIZE;
 
-    __END__;
-
     return result;
 }
 
@@ -724,23 +830,19 @@ double cvGetRatioWindow_W32(const char* name)
 {
     double result = -1;
 
-    CV_FUNCNAME( "cvGetRatioWindow_W32" );
+    CV_FUNCNAME("cvGetRatioWindow_W32");
 
-    __BEGIN__;
-
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT; // keep silence here
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     result = static_cast<double>(window->width) / window->height;
 
-    __END__;
-
     return result;
 }
 
@@ -749,23 +851,20 @@ double cvGetOpenGlProp_W32(const char* name)
     double result = -1;
 
 #ifdef HAVE_OPENGL
-    CV_FUNCNAME( "cvGetOpenGlProp_W32" );
+    CV_FUNCNAME("cvGetOpenGlProp_W32");
 
-    __BEGIN__;
-
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT; // keep silence here
+        return -1;
 
     result = window->useGl;
-
-    __END__;
 #endif
+
     CV_UNUSED(name);
 
     return result;
@@ -775,16 +874,15 @@ double cvGetPropVisible_W32(const char* name)
 {
     double result = -1;
 
-    CV_FUNCNAME( "cvGetPropVisible_W32" );
+    CV_FUNCNAME("cvGetPropVisible_W32");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
-
-    result = (icvFindWindowByName( name ) != NULL);
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    __END__;
+    auto window = icvFindWindowByName(name);
+    result = (bool)window ? 1.0 : 0.0;
 
     return result;
 }
@@ -798,9 +896,9 @@ namespace
 {
     void createGlContext(HWND hWnd, HDC& hGLDC, HGLRC& hGLRC, bool& useGl)
     {
-        CV_FUNCNAME( "createGlContext" );
+        CV_FUNCNAME("createGlContext");
 
-        __BEGIN__;
+        AutoLock lock(getWindowMutex());
 
         useGl = false;
 
@@ -830,120 +928,119 @@ namespace
 
         hGLDC = GetDC(hWnd);
         if (!hGLDC)
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Create A GL Device Context" );
+            CV_Error(Error::OpenGlApiCallError, "Can't Create A GL Device Context");
 
         PixelFormat = ChoosePixelFormat(hGLDC, &pfd);
         if (!PixelFormat)
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Find A Suitable PixelFormat" );
+            CV_Error(Error::OpenGlApiCallError, "Can't Find A Suitable PixelFormat");
 
         if (!SetPixelFormat(hGLDC, PixelFormat, &pfd))
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Set The PixelFormat" );
+            CV_Error(Error::OpenGlApiCallError, "Can't Set The PixelFormat");
 
         hGLRC = wglCreateContext(hGLDC);
         if (!hGLRC)
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Create A GL Rendering Context" );
+            CV_Error(Error::OpenGlApiCallError, "Can't Create A GL Rendering Context");
 
         if (!wglMakeCurrent(hGLDC, hGLRC))
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
+            CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 
         useGl = true;
-
-        __END__;
     }
 
-    void releaseGlContext(CvWindow* window)
+    void releaseGlContext(CvWindow& window)
     {
-        //CV_FUNCNAME( "releaseGlContext" );
+        //CV_FUNCNAME("releaseGlContext");
 
-        __BEGIN__;
+        AutoLock lock(getWindowMutex());
 
-        if (window->hGLRC)
+        if (window.hGLRC)
         {
-            wglDeleteContext(window->hGLRC);
-            window->hGLRC = NULL;
+            wglDeleteContext(window.hGLRC);
+            window.hGLRC = NULL;
         }
 
-        if (window->dc)
+        if (window.dc)
         {
-            ReleaseDC(window->hwnd, window->dc);
-            window->dc = NULL;
+            ReleaseDC(window.hwnd, window.dc);
+            window.dc = NULL;
         }
 
-        window->useGl = false;
-
-        __END__;
+        window.useGl = false;
     }
 
-    void drawGl(CvWindow* window)
+    void drawGl(CvWindow& window)
     {
-        CV_FUNCNAME( "drawGl" );
+        CV_FUNCNAME("drawGl");
 
-        __BEGIN__;
+        AutoLock lock(getWindowMutex());
 
-        if (!wglMakeCurrent(window->dc, window->hGLRC))
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
+        if (!wglMakeCurrent(window.dc, window.hGLRC))
+            CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 
         glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
-        if (window->glDrawCallback)
-            window->glDrawCallback(window->glDrawData);
-
-        if (!SwapBuffers(window->dc))
-            CV_ERROR( CV_OpenGlApiCallError, "Can't swap OpenGL buffers" );
+        if (window.glDrawCallback)
+            window.glDrawCallback(window.glDrawData);
 
-        __END__;
+        if (!SwapBuffers(window.dc))
+            CV_Error(Error::OpenGlApiCallError, "Can't swap OpenGL buffers");
     }
 
-    void resizeGl(CvWindow* window)
+    void resizeGl(CvWindow& window)
     {
-        CV_FUNCNAME( "resizeGl" );
+        CV_FUNCNAME("resizeGl");
 
-        __BEGIN__;
+        AutoLock lock(getWindowMutex());
 
-        if (!wglMakeCurrent(window->dc, window->hGLRC))
-            CV_ERROR( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
+        if (!wglMakeCurrent(window.dc, window.hGLRC))
+            CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 
-        glViewport(0, 0, window->width, window->height);
-
-        __END__;
+        glViewport(0, 0, window.width, window.height);
     }
 }
 
 #endif // HAVE_OPENGL
 
+static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags);
+
+CV_IMPL int cvNamedWindow(const char* name, int flags)
+{
+    CV_FUNCNAME("cvNamedWindow");
+
+    AutoLock lock(getWindowMutex());
+
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name string");
+
+    // Check the name in the storage
+    auto window = icvFindWindowByName(name);
+    if (window)
+    {
+        return 1;
+    }
+
+    window = namedWindow_(name, flags);
+    return (bool)window;
+}
 
-CV_IMPL int cvNamedWindow( const char* name, int flags )
+static std::shared_ptr<CvWindow> namedWindow_(const std::string& name, int flags)
 {
-    int result = 0;
-    CV_FUNCNAME( "cvNamedWindow" );
+    AutoLock lock(getWindowMutex());
 
-    __BEGIN__;
+    cvInitSystem(0,0);
 
     HWND hWnd, mainhWnd;
-    CvWindow* window;
     DWORD defStyle = WS_VISIBLE | WS_MINIMIZEBOX | WS_MAXIMIZEBOX | WS_SYSMENU;
-    int len;
-    CvRect rect;
 #ifdef HAVE_OPENGL
     bool useGl;
     HDC hGLDC;
     HGLRC hGLRC;
 #endif
 
-    cvInitSystem(0,0);
-
-    if( !name )
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
-
-    // Check the name in the storage
-    window = icvFindWindowByName( name );
-    if (window != 0)
-    {
-        result = 1;
-        EXIT;
-    }
+    CvRect rect;
+    icvLoadWindowPos(name.c_str(), rect);
 
-    if( !(flags & CV_WINDOW_AUTOSIZE))//YV add border in order to resize the window
+    if (!(flags & CV_WINDOW_AUTOSIZE))//YV add border in order to resize the window
        defStyle |= WS_SIZEBOX;
 
 #ifdef HAVE_OPENGL
@@ -951,23 +1048,21 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
         defStyle |= WS_CLIPCHILDREN | WS_CLIPSIBLINGS;
 #endif
 
-    icvLoadWindowPos( name, rect );
-
-    mainhWnd = CreateWindow( "Main HighGUI class", name, defStyle | WS_OVERLAPPED,
-                             rect.x, rect.y, rect.width, rect.height, 0, 0, hg_hinstance, 0 );
-    if( !mainhWnd )
-        CV_ERROR( CV_StsError, "Frame window can not be created" );
+    mainhWnd = CreateWindow(mainHighGUIclassName, name.c_str(), defStyle | WS_OVERLAPPED,
+                             rect.x, rect.y, rect.width, rect.height, 0, 0, hg_hinstance, 0);
+    if (!mainhWnd)
+        CV_Error_(Error::StsError, ("Frame window can not be created: '%s'", name.c_str()));
 
     ShowWindow(mainhWnd, SW_SHOW);
 
     //YV- remove one border by changing the style
-    hWnd = CreateWindow("HighGUI class", "", (defStyle & ~WS_SIZEBOX) | WS_CHILD, CW_USEDEFAULT, 0, rect.width, rect.height, mainhWnd, 0, hg_hinstance, 0);
-    if( !hWnd )
-        CV_ERROR( CV_StsError, "Frame window can not be created" );
+    hWnd = CreateWindow(highGUIclassName, "", (defStyle & ~WS_SIZEBOX) | WS_CHILD, CW_USEDEFAULT, 0, rect.width, rect.height, mainhWnd, 0, hg_hinstance, 0);
+    if (!hWnd)
+        CV_Error(Error::StsError, "Frame window can not be created");
 
 #ifndef HAVE_OPENGL
     if (flags & CV_WINDOW_OPENGL)
-        CV_ERROR( CV_OpenGlNotSupported, "Library was built without OpenGL support" );
+        CV_Error(Error::OpenGlNotSupported, "Library was built without OpenGL support");
 #else
     useGl = false;
     hGLDC = 0;
@@ -979,14 +1074,10 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
 
     ShowWindow(hWnd, SW_SHOW);
 
-    len = (int)strlen(name);
-    CV_CALL( window = (CvWindow*)cvAlloc(sizeof(CvWindow) + len + 1));
+    auto window = std::make_shared<CvWindow>(name);
 
-    window->signature = CV_WINDOW_MAGIC_VAL;
     window->hwnd = hWnd;
     window->frame = mainhWnd;
-    window->name = (char*)(window + 1);
-    memcpy( window->name, name, len + 1 );
     window->flags = flags;
     window->image = 0;
 
@@ -1016,200 +1107,175 @@ CV_IMPL int cvNamedWindow( const char* name, int flags )
     window->on_mouse = 0;
     window->on_mouse_param = 0;
 
-    memset( &window->toolbar, 0, sizeof(window->toolbar));
+    icvSetWindowLongPtr(hWnd, CV_USERDATA, window.get());
+    icvSetWindowLongPtr(mainhWnd, CV_USERDATA, window.get());
 
-    window->next = hg_windows;
-    window->prev = 0;
-    if( hg_windows )
-        hg_windows->prev = window;
-    hg_windows = window;
-    icvSetWindowLongPtr( hWnd, CV_USERDATA, window );
-    icvSetWindowLongPtr( mainhWnd, CV_USERDATA, window );
+    auto& g_windows = getWindowsList();
+    g_windows.push_back(window);
 
     // Recalculate window pos
-    icvUpdateWindowPos( window );
+    icvUpdateWindowPos(*window);
 
-    result = 1;
-    __END__;
-
-    return result;
+    return window;
 }
 
 #ifdef HAVE_OPENGL
 
 CV_IMPL void cvSetOpenGlContext(const char* name)
 {
-    CV_FUNCNAME( "cvSetOpenGlContext" );
-
-    __BEGIN__;
+    CV_FUNCNAME("cvSetOpenGlContext");
 
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
-    if(!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        CV_ERROR( CV_StsNullPtr, "NULL window" );
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     if (!window->useGl)
-        CV_ERROR( CV_OpenGlNotSupported, "Window doesn't support OpenGL" );
+        CV_Error(Error::OpenGlNotSupported, "Window doesn't support OpenGL");
 
     if (!wglMakeCurrent(window->dc, window->hGLRC))
-        CV_ERROR( CV_OpenGlApiCallError, "Can't Activate The GL Rendering Context" );
-
-    __END__;
+        CV_Error(Error::OpenGlApiCallError, "Can't Activate The GL Rendering Context");
 }
 
 CV_IMPL void cvUpdateWindow(const char* name)
 {
-    CV_FUNCNAME( "cvUpdateWindow" );
-
-    __BEGIN__;
+    CV_FUNCNAME("cvUpdateWindow");
 
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
     if (!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
+    auto window = icvFindWindowByName(name);
     if (!window)
-        EXIT;
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     InvalidateRect(window->hwnd, 0, 0);
-
-    __END__;
 }
 
 CV_IMPL void cvSetOpenGlDrawCallback(const char* name, CvOpenGlDrawCallback callback, void* userdata)
 {
-    CV_FUNCNAME( "cvCreateOpenGLCallback" );
-
-    __BEGIN__;
+    CV_FUNCNAME("cvCreateOpenGLCallback");
 
-    CvWindow* window;
+    AutoLock lock(getWindowMutex());
 
-    if(!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    window = icvFindWindowByName( name );
-    if( !window )
-        EXIT;
+    auto window = icvFindWindowByName(name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     if (!window->useGl)
-        CV_ERROR( CV_OpenGlNotSupported, "Window was created without OpenGL context" );
+        CV_Error(Error::OpenGlNotSupported, "Window was created without OpenGL context");
 
     window->glDrawCallback = callback;
     window->glDrawData = userdata;
-
-    __END__;
 }
 
 #endif // HAVE_OPENGL
 
-static void icvRemoveWindow( CvWindow* window )
+static void icvRemoveWindow(const std::shared_ptr<CvWindow>& window_)
 {
-    CvTrackbar* trackbar = NULL;
+    CV_Assert(window_);
+    AutoLock lock(getWindowMutex());
+    CvWindow& window = *window_;
+
     RECT wrect={0,0,0,0};
 
+    auto& g_windows = getWindowsList();
+    for (auto it = g_windows.begin(); it != g_windows.end(); ++it)
+    {
+        const std::shared_ptr<CvWindow>& w = *it;
+        if (w.get() == &window)
+        {
+            g_windows.erase(it);
+            break;
+        }
+    }
+
 #ifdef HAVE_OPENGL
-    if (window->useGl)
+    if (window.useGl)
         releaseGlContext(window);
 #endif
 
-    if( window->frame )
-        GetWindowRect( window->frame, &wrect );
-    if( window->name )
-        icvSaveWindowPos( window->name, cvRect(wrect.left, wrect.top,
-            wrect.right-wrect.left, wrect.bottom-wrect.top) );
-
-    if( window->hwnd )
-        icvSetWindowLongPtr( window->hwnd, CV_USERDATA, 0 );
-    if( window->frame )
-        icvSetWindowLongPtr( window->frame, CV_USERDATA, 0 );
-
-    if( window->toolbar.toolbar )
-        icvSetWindowLongPtr(window->toolbar.toolbar, CV_USERDATA, 0);
-
-    if( window->prev )
-        window->prev->next = window->next;
-    else
-        hg_windows = window->next;
+    if (window.frame)
+        GetWindowRect(window.frame, &wrect);
+    icvSaveWindowPos(window.name.c_str(), cvRect(wrect.left, wrect.top, wrect.right-wrect.left, wrect.bottom-wrect.top));
 
-    if( window->next )
-        window->next->prev = window->prev;
+    if (window.hwnd)
+        icvSetWindowLongPtr(window.hwnd, CV_USERDATA, 0);
+    if (window.frame)
+        icvSetWindowLongPtr(window.frame, CV_USERDATA, 0);
 
-    window->prev = window->next = 0;
+    if (window.toolbar.toolbar)
+        icvSetWindowLongPtr(window.toolbar.toolbar, CV_USERDATA, 0);
 
-    if( window->dc && window->image )
-        DeleteObject(SelectObject(window->dc,window->image));
+    if (window.dc && window.image)
+        DeleteObject(SelectObject(window.dc, window.image));
 
-    if( window->dc )
-        DeleteDC(window->dc);
+    if (window.dc)
+        DeleteDC(window.dc);
 
-    for( trackbar = window->toolbar.first; trackbar != 0; )
+    for (auto it = window.toolbar.trackbars.begin(); it != window.toolbar.trackbars.end(); ++it)
     {
-        CvTrackbar* next = trackbar->next;
-        if( trackbar->hwnd )
+        auto trackbar = (*it).get();
+        if (trackbar && trackbar->hwnd)
         {
-            icvSetWindowLongPtr( trackbar->hwnd, CV_USERDATA, 0 );
-            cvFree( &trackbar );
+            icvSetWindowLongPtr(trackbar->hwnd, CV_USERDATA, 0);
         }
-        trackbar = next;
     }
-
-    cvFree( &window );
 }
 
 
-CV_IMPL void cvDestroyWindow( const char* name )
+CV_IMPL void cvDestroyWindow(const char* name)
 {
-    CV_FUNCNAME( "cvDestroyWindow" );
+    CV_FUNCNAME("cvDestroyWindow");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
-    CvWindow* window;
-    HWND mainhWnd;
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name string");
 
-    if(!name)
-        CV_ERROR( CV_StsNullPtr, "NULL name string" );
+    auto window = icvFindWindowByName(name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    window = icvFindWindowByName( name );
-    if( !window )
-        EXIT;
+    window->destroy();
+}
 
-    mainhWnd = window->frame;
 
-    SendMessage(window->hwnd, WM_CLOSE, 0, 0);
-    SendMessage( mainhWnd, WM_CLOSE, 0, 0);
+void CvWindow::destroy()
+{
+    SendMessage(hwnd, WM_CLOSE, 0, 0);
+    SendMessage(frame, WM_CLOSE, 0, 0);
     // Do NOT call _remove_window -- CvWindow list will be updated automatically ...
-
-    __END__;
 }
 
-
-static void icvScreenToClient( HWND hwnd, RECT* rect )
+static void icvScreenToClient(HWND hwnd, RECT* rect)
 {
     POINT p;
     p.x = rect->left;
     p.y = rect->top;
     ScreenToClient(hwnd, &p);
-    OffsetRect( rect, p.x - rect->left, p.y - rect->top );
+    OffsetRect(rect, p.x - rect->left, p.y - rect->top);
 }
 
 
 /* Calculatess the window coordinates relative to the upper left corner of the mainhWnd window */
-static RECT icvCalcWindowRect( CvWindow* window )
+static RECT icvCalcWindowRect(CvWindow& window)
 {
     RECT crect = { 0 }, trect = { 0 }, rect = { 0 };
 
-    assert(window);
-
-    GetClientRect(window->frame, &crect);
-    if (window->toolbar.toolbar)
+    GetClientRect(window.frame, &crect);
+    if (window.toolbar.toolbar)
     {
-        GetWindowRect(window->toolbar.toolbar, &trect);
-        icvScreenToClient(window->frame, &trect);
+        GetWindowRect(window.toolbar.toolbar, &trect);
+        icvScreenToClient(window.frame, &trect);
         SubtractRect(&rect, &crect, &trect);
     }
     else
@@ -1217,138 +1283,153 @@ static RECT icvCalcWindowRect( CvWindow* window )
 
     return rect;
 }
+static inline RECT icvCalcWindowRect(CvWindow* window) { CV_Assert(window); return icvCalcWindowRect(*window); }
+
 
-// returns TRUE if there is a problem such as ERROR_IO_PENDING.
-static bool icvGetBitmapData( CvWindow* window, SIZE* size, int* channels, void** data )
+// returns FALSE if there is a problem such as ERROR_IO_PENDING.
+static bool icvGetBitmapData(CvWindow& window, SIZE& size, int& channels, void*& data)
 {
-    BITMAP bmp;
     GdiFlush();
-    HGDIOBJ h = GetCurrentObject( window->dc, OBJ_BITMAP );
-    if( size )
-        size->cx = size->cy = 0;
-    if( data )
-        *data = 0;
+
+    HGDIOBJ h = GetCurrentObject(window.dc, OBJ_BITMAP);
+    size.cx = size.cy = 0;
+    data = 0;
 
     if (h == NULL)
-        return true;
+        return false;
+
+    BITMAP bmp = {};
     if (GetObject(h, sizeof(bmp), &bmp) == 0)
-        return true;
+        return false;
 
-    if( size )
-    {
-        size->cx = abs(bmp.bmWidth);
-        size->cy = abs(bmp.bmHeight);
-    }
+    size.cx = abs(bmp.bmWidth);
+    size.cy = abs(bmp.bmHeight);
 
-    if( channels )
-        *channels = bmp.bmBitsPixel/8;
+    channels = bmp.bmBitsPixel/8;
 
-    if( data )
-        *data = bmp.bmBits;
+    data = bmp.bmBits;
 
-    return false;
+    return true;
+}
+static bool icvGetBitmapData(CvWindow& window, SIZE& size)
+{
+    int channels = 0;
+    void* data = nullptr;
+    return icvGetBitmapData(window, size, channels, data);
 }
 
 
-static void icvUpdateWindowPos( CvWindow* window )
+static void icvUpdateWindowPos(CvWindow& window)
 {
     RECT rect = { 0 };
-    assert(window);
 
-    if( (window->flags & CV_WINDOW_AUTOSIZE) && window->image )
+    if ((window.flags & CV_WINDOW_AUTOSIZE) && window.image)
     {
         int i;
         SIZE size = {0,0};
-        icvGetBitmapData( window, &size, 0, 0 );
+        icvGetBitmapData(window, size);  // TODO check return value?
 
         // Repeat two times because after the first resizing of the mainhWnd window
         // toolbar may resize too
-        for(i = 0; i < (window->toolbar.toolbar ? 2 : 1); i++)
+        for(i = 0; i < (window.toolbar.toolbar ? 2 : 1); i++)
         {
-            RECT rmw = { 0 }, rw = icvCalcWindowRect(window );
-            MoveWindow(window->hwnd, rw.left, rw.top,
+            RECT rmw = { 0 }, rw = icvCalcWindowRect(&window);
+            MoveWindow(window.hwnd, rw.left, rw.top,
                 rw.right - rw.left, rw.bottom - rw.top, FALSE);
-            GetClientRect(window->hwnd, &rw);
-            GetWindowRect(window->frame, &rmw);
+            GetClientRect(window.hwnd, &rw);
+            GetWindowRect(window.frame, &rmw);
             // Resize the mainhWnd window in order to make the bitmap fit into the child window
-            MoveWindow(window->frame, rmw.left, rmw.top,
+            MoveWindow(window.frame, rmw.left, rmw.top,
                 size.cx + (rmw.right - rmw.left) - (rw.right - rw.left),
-                size.cy + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE );
+                size.cy + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE);
         }
     }
 
     rect = icvCalcWindowRect(window);
-    MoveWindow(window->hwnd, rect.left, rect.top,
+    MoveWindow(window.hwnd, rect.left, rect.top,
                rect.right - rect.left,
-               rect.bottom - rect.top, TRUE );
+               rect.bottom - rect.top, TRUE);
 }
 
+static void showImage_(CvWindow& window, const Mat& image);
+
 CV_IMPL void
-cvShowImage( const char* name, const CvArr* arr )
+cvShowImage(const char* name, const CvArr* arr)
 {
-    CV_FUNCNAME( "cvShowImage" );
-
-    __BEGIN__;
-
-    CvWindow* window;
-    SIZE size = { 0, 0 };
-    int channels = 0;
-    void* dst_ptr = 0;
-    const int channels0 = 3;
-    CvMat stub, *image;
-    bool changed_size = false; // philipg
+    CV_FUNCNAME("cvShowImage");
 
-    if( !name )
-        CV_ERROR( CV_StsNullPtr, "NULL name" );
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name");
 
-    window = icvFindWindowByName(name);
-    if(!window)
+    std::shared_ptr<CvWindow> window;
     {
-        cvNamedWindow(name, CV_WINDOW_AUTOSIZE);
+        AutoLock lock(getWindowMutex());
+
         window = icvFindWindowByName(name);
+        if (!window)
+        {
+            cvNamedWindow(name, CV_WINDOW_AUTOSIZE);
+            window = icvFindWindowByName(name);
+        }
     }
 
-    if( !window || !arr )
-        EXIT; // keep silence here.
-
-    CV_CALL( image = cvGetMat( arr, &stub ));
+    if (!window || !arr)
+        return; // keep silence here.
 
+    CvMat stub = {};
+    CvMat* image_c = cvGetMat(arr, &stub);
+    Mat image = cv::cvarrToMat(image_c);
 #ifdef HAVE_OPENGL
     if (window->useGl)
     {
-        cv::imshow(name, cv::cvarrToMat(image));
+        cv::imshow(name, image);
         return;
     }
 #endif
+    return showImage_(*window, image);
+}
+
+static void showImage_(CvWindow& window, const Mat& image)
+{
+    AutoLock lock(window.mutex);
+
+    SIZE size = { 0, 0 };
+    int channels = 0;
+    void* dst_ptr = 0;
+    const int channels0 = 3;
+    bool changed_size = false; // philipg
 
-    if (window->image)
+    if (window.image)
+    {
         // if there is something wrong with these system calls, we cannot display image...
-        if (icvGetBitmapData( window, &size, &channels, &dst_ptr ))
+        if (!icvGetBitmapData(window, size, channels, dst_ptr))
             return;
+    }
 
-    if( size.cx != image->width || size.cy != image->height || channels != channels0 )
+    if (size.cx != image.cols || size.cy != image.rows || channels != channels0)
     {
         changed_size = true;
 
         uchar buffer[sizeof(BITMAPINFO) + 255*sizeof(RGBQUAD)];
         BITMAPINFO* binfo = (BITMAPINFO*)buffer;
 
-        DeleteObject( SelectObject( window->dc, window->image ));
-        window->image = 0;
+        DeleteObject(SelectObject(window.dc, window.image));
+        window.image = 0;
 
-        size.cx = image->width;
-        size.cy = image->height;
+        size.cx = image.cols;
+        size.cy = image.rows;
         channels = channels0;
 
-        FillBitmapInfo( binfo, size.cx, size.cy, channels*8, 1 );
+        FillBitmapInfo(binfo, size.cx, size.cy, channels*8, 1);
 
-        window->image = SelectObject( window->dc, CreateDIBSection(window->dc, binfo,
-                                      DIB_RGB_COLORS, &dst_ptr, 0, 0));
+        window.image = SelectObject(window.dc,
+                CreateDIBSection(window.dc, binfo, DIB_RGB_COLORS, &dst_ptr, 0, 0)
+            );
     }
 
     {
         cv::Mat dst(size.cy, size.cx, CV_8UC3, dst_ptr, (size.cx * channels + 3) & -4);
-        convertToShow(cv::cvarrToMat(image), dst, false);
+        convertToShow(image, dst, false);
         CV_Assert(dst.data == (uchar*)dst_ptr);
         cv::flip(dst, dst, 0);
     }
@@ -1356,98 +1437,103 @@ cvShowImage( const char* name, const CvArr* arr )
     // only resize window if needed
     if (changed_size)
         icvUpdateWindowPos(window);
-    InvalidateRect(window->hwnd, 0, 0);
+    InvalidateRect(window.hwnd, 0, 0);
     // philipg: this is not needed and just slows things down
     //    UpdateWindow(window->hwnd);
-
-    __END__;
 }
 
-CV_IMPL void cvResizeWindow(const char* name, int width, int height )
+static void resizeWindow_(CvWindow& window, const Size& size);
+
+CV_IMPL void cvResizeWindow(const char* name, int width, int height)
 {
-    CV_FUNCNAME( "cvResizeWindow" );
+    CV_FUNCNAME("cvResizeWindow");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
-    int i;
-    CvWindow* window;
-    RECT rmw = { 0 }, rw = { 0 }, rect = { 0 };
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name");
 
-    if( !name )
-        CV_ERROR( CV_StsNullPtr, "NULL name" );
+    auto window = icvFindWindowByName(name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    window = icvFindWindowByName(name);
-    if(!window)
-        EXIT;
+    return resizeWindow_(*window, Size(width, height));
+}
+
+static void resizeWindow_(CvWindow& window, const Size& size)
+{
+    RECT rmw = { 0 }, rw = { 0 }, rect = { 0 };
 
     // Repeat two times because after the first resizing of the mainhWnd window
     // toolbar may resize too
-    for(i = 0; i < (window->toolbar.toolbar ? 2 : 1); i++)
+    for (int i = 0; i < (window.toolbar.toolbar ? 2 : 1); i++)
     {
         rw = icvCalcWindowRect(window);
-        MoveWindow(window->hwnd, rw.left, rw.top,
+        MoveWindow(window.hwnd, rw.left, rw.top,
             rw.right - rw.left, rw.bottom - rw.top, FALSE);
-        GetClientRect(window->hwnd, &rw);
-        GetWindowRect(window->frame, &rmw);
+        GetClientRect(window.hwnd, &rw);
+        GetWindowRect(window.frame, &rmw);
         // Resize the mainhWnd window in order to make the bitmap fit into the child window
-        MoveWindow(window->frame, rmw.left, rmw.top,
-            width  + (rmw.right - rmw.left) - (rw.right - rw.left),
-            height + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE);
+        MoveWindow(window.frame, rmw.left, rmw.top,
+            size.width  + (rmw.right - rmw.left) - (rw.right - rw.left),
+            size.height + (rmw.bottom - rmw.top) - (rw.bottom - rw.top), TRUE);
     }
 
     rect = icvCalcWindowRect(window);
-    MoveWindow(window->hwnd, rect.left, rect.top,
+    MoveWindow(window.hwnd, rect.left, rect.top,
         rect.right - rect.left, rect.bottom - rect.top, TRUE);
-
-    __END__;
 }
 
+static void moveWindow_(CvWindow& window, const Point& pt);
 
-CV_IMPL void cvMoveWindow( const char* name, int x, int y )
+CV_IMPL void cvMoveWindow(const char* name, int x, int y)
 {
-    CV_FUNCNAME( "cvMoveWindow" );
+    CV_FUNCNAME("cvMoveWindow");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
-    CvWindow* window;
-    RECT rect = { 0 };
-
-    if( !name )
-        CV_ERROR( CV_StsNullPtr, "NULL name" );
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL name");
 
-    window = icvFindWindowByName(name);
-    if(!window)
-        EXIT;
+    auto window = icvFindWindowByName(name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
-    GetWindowRect( window->frame, &rect );
-    MoveWindow( window->frame, x, y, rect.right - rect.left, rect.bottom - rect.top, TRUE);
+    (void)moveWindow_(*window, Point(x, y));
+}
 
-    __END__;
+static void moveWindow_(CvWindow& window, const Point& pt)
+{
+    RECT rect = { 0 };
+    GetWindowRect(window.frame, &rect);  // TODO check return value
+    MoveWindow(window.frame, pt.x, pt.y, rect.right - rect.left, rect.bottom - rect.top, TRUE);
 }
 
 
 static LRESULT CALLBACK
-MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
+MainWindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
-    CvWindow* window = icvWindowByHWND( hwnd );
-    if( !window )
+    auto window_ = icvWindowByHWND(hwnd);
+    if (!window_)
         return DefWindowProc(hwnd, uMsg, wParam, lParam);
 
+    CvWindow& window = *window_;
+
     switch(uMsg)
     {
     case WM_COPY:
-        ::SendMessage(window->hwnd, uMsg, wParam, lParam);
+        ::SendMessage(window.hwnd, uMsg, wParam, lParam);
         break;
 
     case WM_DESTROY:
 
-        icvRemoveWindow(window);
+        icvRemoveWindow(window_);
         // Do nothing!!!
         //PostQuitMessage(0);
         break;
 
     case WM_GETMINMAXINFO:
-        if( !(window->flags & CV_WINDOW_AUTOSIZE) )
+        if (!(window.flags & CV_WINDOW_AUTOSIZE))
         {
             MINMAXINFO* minmax = (MINMAXINFO*)lParam;
             RECT rect = { 0 };
@@ -1456,10 +1542,10 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
             minmax->ptMinTrackSize.y = 100;
             minmax->ptMinTrackSize.x = 100;
 
-            if( window->toolbar.first )
+            if (!window.toolbar.trackbars.empty())
             {
-                GetWindowRect( window->toolbar.first->hwnd, &rect );
-                minmax->ptMinTrackSize.y += window->toolbar.rows*(rect.bottom - rect.top);
+                GetWindowRect(window.toolbar.trackbars[0]->hwnd, &rect);
+                minmax->ptMinTrackSize.y += window.toolbar.rows*(rect.bottom - rect.top);
                 minmax->ptMinTrackSize.x = MAX(rect.right - rect.left + HG_BUDDY_WIDTH, HG_BUDDY_WIDTH*2);
             }
             return retval;
@@ -1471,14 +1557,14 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
             WINDOWPOS* pos = (WINDOWPOS*)lParam;
 
             // Update the toolbar pos/size
-            if(window->toolbar.toolbar)
+            if (window.toolbar.toolbar)
             {
                 RECT rect = { 0 };
-                GetWindowRect(window->toolbar.toolbar, &rect);
-                MoveWindow(window->toolbar.toolbar, 0, 0, pos->cx, rect.bottom - rect.top, TRUE);
+                GetWindowRect(window.toolbar.toolbar, &rect);
+                MoveWindow(window.toolbar.toolbar, 0, 0, pos->cx, rect.bottom - rect.top, TRUE);
             }
 
-            if(!(window->flags & CV_WINDOW_AUTOSIZE))
+            if (!(window.flags & CV_WINDOW_AUTOSIZE))
                 icvUpdateWindowPos(window);
 
             break;
@@ -1490,7 +1576,7 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
           LPWINDOWPOS pos = (LPWINDOWPOS)lParam;
 
           RECT rect = { 0 };
-          GetWindowRect(window->frame, &rect);
+          GetWindowRect(window.frame, &rect);
 
           HMONITOR hMonitor;
           hMonitor = MonitorFromRect(&rect, MONITOR_DEFAULTTONEAREST);
@@ -1515,13 +1601,13 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
        }
 
     case WM_ACTIVATE:
-        if(LOWORD(wParam) == WA_ACTIVE || LOWORD(wParam) == WA_CLICKACTIVE)
-            SetFocus(window->hwnd);
+        if (LOWORD(wParam) == WA_ACTIVE || LOWORD(wParam) == WA_CLICKACTIVE)
+            SetFocus(window.hwnd);
         break;
 
     case WM_MOUSEWHEEL:
     case WM_MOUSEHWHEEL:
-       if( window->on_mouse )
+       if (window.on_mouse)
        {
           int flags = (wParam & MK_LBUTTON      ? CV_EVENT_FLAG_LBUTTON  : 0)|
                       (wParam & MK_RBUTTON      ? CV_EVENT_FLAG_RBUTTON  : 0)|
@@ -1536,32 +1622,32 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
           flags |= (delta << 16);
 
           POINT pt;
-          pt.x = GET_X_LPARAM( lParam );
-          pt.y = GET_Y_LPARAM( lParam );
+          pt.x = GET_X_LPARAM(lParam);
+          pt.y = GET_Y_LPARAM(lParam);
           ::ScreenToClient(hwnd, &pt); // Convert screen coordinates to client coordinates.
 
           RECT rect = { 0 };
-          GetClientRect( window->hwnd, &rect );
+          GetClientRect(window.hwnd, &rect);
 
           SIZE size = {0,0};
 #ifdef HAVE_OPENGL
-          if (window->useGl)
+          if (window.useGl)
           {
-              cv::ogl::Texture2D* texObj = static_cast<cv::ogl::Texture2D*>(window->glDrawData);
+              cv::ogl::Texture2D* texObj = static_cast<cv::ogl::Texture2D*>(window.glDrawData);
               size.cx = texObj->cols();
               size.cy = texObj->rows();
           }
           else
           {
-              icvGetBitmapData(window, &size, 0, 0);
+              icvGetBitmapData(window, size);
           }
 #else
-          icvGetBitmapData(window, &size, 0, 0);
+          icvGetBitmapData(window, size);
 #endif
 
-          window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1),
-                                   pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags,
-                                   window->on_mouse_param );
+          int x = cvRound((float)pt.x*size.cx/MAX(rect.right - rect.left,1));
+          int y = cvRound((float)pt.y*size.cy/MAX(rect.bottom - rect.top,1));
+          window.on_mouse(event, x, y, flags, window.on_mouse_param);
        }
        break;
 
@@ -1571,17 +1657,17 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
             HRGN rgn, rgn1, rgn2;
             int ret;
             HDC hdc = (HDC)wParam;
-            GetWindowRect(window->hwnd, &cr);
-            icvScreenToClient(window->frame, &cr);
-            if(window->toolbar.toolbar)
+            GetWindowRect(window.hwnd, &cr);
+            icvScreenToClient(window.frame, &cr);
+            if (window.toolbar.toolbar)
             {
-                GetWindowRect(window->toolbar.toolbar, &tr);
-                icvScreenToClient(window->frame, &tr);
+                GetWindowRect(window.toolbar.toolbar, &tr);
+                icvScreenToClient(window.frame, &tr);
             }
             else
                 tr.left = tr.top = tr.right = tr.bottom = 0;
 
-            GetClientRect(window->frame, &wrc);
+            GetClientRect(window.frame, &wrc);
 
             rgn = CreateRectRgn(0, 0, wrc.right, wrc.bottom);
             rgn1 = CreateRectRgn(cr.left, cr.top, cr.right, cr.bottom);
@@ -1591,7 +1677,7 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
             ret = CombineRgn(rgn, rgn, rgn1, RGN_DIFF);
             ret = CombineRgn(rgn, rgn, rgn2, RGN_DIFF);
 
-            if(ret != NULLREGION && ret != ERROR)
+            if (ret != NULLREGION && ret != ERROR)
                 FillRgn(hdc, rgn, (HBRUSH)icvGetClassLongPtr(hwnd, CV_HBRBACKGROUND));
 
             DeleteObject(rgn);
@@ -1605,20 +1691,24 @@ MainWindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
 }
 
 
-static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
+static LRESULT CALLBACK HighGUIProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
-    CvWindow* window = icvWindowByHWND(hwnd);
-    if( !window )
+    auto window_ = icvWindowByHWND(hwnd);
+    if (!window_)
+    {
         // This window is not mentioned in HighGUI storage
         // Actually, this should be error except for the case of calls to CreateWindow
         return DefWindowProc(hwnd, uMsg, wParam, lParam);
+    }
+
+    CvWindow& window = *window_;
 
     // Process the message
     switch(uMsg)
     {
     case WM_COPY:
         {
-            if (!::OpenClipboard(hwnd) )
+            if (!::OpenClipboard(hwnd))
                 break;
 
             HDC hDC       = 0;
@@ -1632,7 +1722,7 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                 if (!::EmptyClipboard())
                     break;
 
-                if(!window->image)
+                if (!window.image)
                     break;
 
                 // Get window device context
@@ -1640,19 +1730,20 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                     break;
 
                 // Create another DC compatible with hDC
-                if (0 == (memDC = ::CreateCompatibleDC( hDC )))
+                if (0 == (memDC = ::CreateCompatibleDC(hDC)))
                     break;
 
                 // Determine the bitmap's dimensions
-                int nchannels = 3;
                 SIZE size = {0,0};
-                icvGetBitmapData( window, &size, &nchannels, 0 );
+                int nchannels = 3;
+                void* data = NULL;  // unused
+                icvGetBitmapData(window, size, nchannels, data);
 
                 // Create bitmap to draw on and it in the new DC
-                if (0 == (memBM = ::CreateCompatibleBitmap ( hDC, size.cx, size.cy)))
+                if (0 == (memBM = ::CreateCompatibleBitmap(hDC, size.cx, size.cy)))
                     break;
 
-                if (!::SelectObject( memDC, memBM ))
+                if (!::SelectObject(memDC, memBM))
                     break;
 
                 // Begin drawing to DC
@@ -1660,7 +1751,7 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                     break;
 
                 RGBQUAD table[256];
-                if( 1 == nchannels )
+                if (1 == nchannels)
                 {
                     for(int i = 0; i < 256; ++i)
                     {
@@ -1668,14 +1759,14 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                         table[i].rgbGreen = (unsigned char)i;
                         table[i].rgbRed = (unsigned char)i;
                     }
-                    if (!::SetDIBColorTable(window->dc, 0, 255, table))
+                    if (!::SetDIBColorTable(window.dc, 0, 255, table))
                         break;
                 }
 
                 // The image copied to the clipboard will be in its original size, regardless if the window itself was resized.
 
                 // Render the image to the dc/bitmap (at original size).
-                if (!::BitBlt( memDC, 0, 0, size.cx, size.cy, window->dc, 0, 0, SRCCOPY ))
+                if (!::BitBlt(memDC, 0, 0, size.cx, size.cy, window.dc, 0, 0, SRCCOPY))
                     break;
 
                 // Finally, set bitmap to clipboard
@@ -1712,7 +1803,7 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
     case WM_RBUTTONUP:
     case WM_MBUTTONUP:
     case WM_MOUSEMOVE:
-        if( window->on_mouse )
+        if (window.on_mouse)
         {
             POINT pt;
 
@@ -1732,50 +1823,50 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                         uMsg == WM_RBUTTONDBLCLK ? CV_EVENT_RBUTTONDBLCLK :
                         uMsg == WM_MBUTTONDBLCLK ? CV_EVENT_MBUTTONDBLCLK :
                                                    CV_EVENT_MOUSEMOVE;
-            if( uMsg == WM_LBUTTONDOWN || uMsg == WM_RBUTTONDOWN || uMsg == WM_MBUTTONDOWN )
-                SetCapture( hwnd );
-            if( uMsg == WM_LBUTTONUP || uMsg == WM_RBUTTONUP || uMsg == WM_MBUTTONUP )
+            if (uMsg == WM_LBUTTONDOWN || uMsg == WM_RBUTTONDOWN || uMsg == WM_MBUTTONDOWN)
+                SetCapture(hwnd);
+            if (uMsg == WM_LBUTTONUP || uMsg == WM_RBUTTONUP || uMsg == WM_MBUTTONUP)
                 ReleaseCapture();
 
-            pt.x = GET_X_LPARAM( lParam );
-            pt.y = GET_Y_LPARAM( lParam );
+            pt.x = GET_X_LPARAM(lParam);
+            pt.y = GET_Y_LPARAM(lParam);
 
-            if (window->flags & CV_WINDOW_AUTOSIZE)
+            if (window.flags & CV_WINDOW_AUTOSIZE)
             {
                 // As user can't change window size, do not scale window coordinates. Underlying windowing system
                 // may prevent full window from being displayed and in this case coordinates should not be scaled.
-                window->on_mouse( event, pt.x, pt.y, flags, window->on_mouse_param );
+                window.on_mouse(event, pt.x, pt.y, flags, window.on_mouse_param);
             } else {
                 // Full window is displayed using different size. Scale coordinates to match underlying positions.
                 RECT rect = { 0 };
                 SIZE size = {0, 0};
 
-                GetClientRect( window->hwnd, &rect );
+                GetClientRect(window.hwnd, &rect);
 
 #ifdef HAVE_OPENGL
-                if (window->useGl)
+                if (window.useGl)
                 {
-                    cv::ogl::Texture2D* texObj = static_cast<cv::ogl::Texture2D*>(window->glDrawData);
+                    cv::ogl::Texture2D* texObj = static_cast<cv::ogl::Texture2D*>(window.glDrawData);
                     size.cx = texObj->cols();
                     size.cy = texObj->rows();
                 }
                 else
                 {
-                    icvGetBitmapData(window, &size, 0, 0);
+                    icvGetBitmapData(window, size);
                 }
 #else
-                icvGetBitmapData( window, &size, 0, 0 );
+                icvGetBitmapData(window, size);
 #endif
 
-                window->on_mouse( event, pt.x*size.cx/MAX(rect.right - rect.left,1),
-                                         pt.y*size.cy/MAX(rect.bottom - rect.top,1), flags,
-                                         window->on_mouse_param );
+                int x = cvRound((float)pt.x*size.cx/MAX(rect.right - rect.left,1));
+                int y = cvRound((float)pt.y*size.cy/MAX(rect.bottom - rect.top,1));
+                window.on_mouse(event, x, y, flags, window.on_mouse_param);
             }
         }
         break;
 
     case WM_PAINT:
-        if(window->image != 0)
+        if (window.image != 0)
         {
             int nchannels = 3;
             SIZE size = {0,0};
@@ -1784,12 +1875,13 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
             RGBQUAD table[256];
 
             // Determine the bitmap's dimensions
-            icvGetBitmapData( window, &size, &nchannels, 0 );
+            void* data = 0;  // unused
+            icvGetBitmapData(window, size, nchannels, data);
 
             hdc = BeginPaint(hwnd, &paint);
             SetStretchBltMode(hdc, COLORONCOLOR);
 
-            if( nchannels == 1 )
+            if (nchannels == 1)
             {
                 int i;
                 for(i = 0; i < 256; i++)
@@ -1798,25 +1890,25 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
                     table[i].rgbGreen = (unsigned char)i;
                     table[i].rgbRed = (unsigned char)i;
                 }
-                SetDIBColorTable(window->dc, 0, 255, table);
+                SetDIBColorTable(window.dc, 0, 255, table);
             }
 
-            if(window->flags & CV_WINDOW_AUTOSIZE)
+            if (window.flags & CV_WINDOW_AUTOSIZE)
             {
-                BitBlt( hdc, 0, 0, size.cx, size.cy, window->dc, 0, 0, SRCCOPY );
+                BitBlt(hdc, 0, 0, size.cx, size.cy, window.dc, 0, 0, SRCCOPY);
             }
             else
             {
                 RECT rect = { 0 };
-                GetClientRect(window->hwnd, &rect);
-                StretchBlt( hdc, 0, 0, rect.right - rect.left, rect.bottom - rect.top,
-                            window->dc, 0, 0, size.cx, size.cy, SRCCOPY );
+                GetClientRect(window.hwnd, &rect);
+                StretchBlt(hdc, 0, 0, rect.right - rect.left, rect.bottom - rect.top,
+                            window.dc, 0, 0, size.cx, size.cy, SRCCOPY);
             }
             //DeleteDC(hdc);
             EndPaint(hwnd, &paint);
         }
 #ifdef HAVE_OPENGL
-        else if(window->useGl)
+        else if (window.useGl)
         {
             drawGl(window);
             return DefWindowProc(hwnd, uMsg, wParam, lParam);
@@ -1829,13 +1921,13 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
         return 0;
 
     case WM_ERASEBKGND:
-        if(window->image)
+        if (window.image)
             return 0;
         break;
 
     case WM_DESTROY:
 
-        icvRemoveWindow(window);
+        icvRemoveWindow(window_);
         // Do nothing!!!
         //PostQuitMessage(0);
         break;
@@ -1845,15 +1937,15 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
         return 0;
 
     case WM_KEYDOWN:
-        window->last_key = (int)wParam;
+        window.last_key = (int)wParam;
         return 0;
 
     case WM_SIZE:
-        window->width = LOWORD(lParam);
-        window->height = HIWORD(lParam);
+        window.width = LOWORD(lParam);
+        window.height = HIWORD(lParam);
 
 #ifdef HAVE_OPENGL
-        if (window->useGl)
+        if (window.useGl)
             resizeGl(window);
 #endif
     }
@@ -1862,24 +1954,24 @@ static LRESULT CALLBACK HighGUIProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
 }
 
 
-static LRESULT CALLBACK WindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
+static LRESULT CALLBACK WindowProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
     LRESULT ret;
 
-    if( hg_on_preprocess )
+    if (hg_on_preprocess)
     {
         int was_processed = 0;
         int rethg = hg_on_preprocess(hwnd, uMsg, wParam, lParam, &was_processed);
-        if( was_processed )
+        if (was_processed)
             return rethg;
     }
     ret = HighGUIProc(hwnd, uMsg, wParam, lParam);
 
-    if(hg_on_postprocess)
+    if (hg_on_postprocess)
     {
         int was_processed = 0;
         int rethg = hg_on_postprocess(hwnd, uMsg, wParam, lParam, &was_processed);
-        if( was_processed )
+        if (was_processed)
             return rethg;
     }
 
@@ -1887,51 +1979,56 @@ static LRESULT CALLBACK WindowProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM
 }
 
 
-static void icvUpdateTrackbar( CvTrackbar* trackbar, int pos )
+static void icvUpdateTrackbar(CvTrackbar& trackbar, int pos)
 {
     const int max_name_len = 10;
     const char* suffix = "";
     char pos_text[32];
     int name_len;
 
-    if( trackbar->data )
-        *trackbar->data = pos;
+    if (trackbar.data)
+        *trackbar.data = pos;
 
-    if( trackbar->pos != pos )
+    if (trackbar.pos != pos)
     {
-        trackbar->pos = pos;
-        if( trackbar->notify2 )
-            trackbar->notify2(pos, trackbar->userdata);
-        if( trackbar->notify )
-            trackbar->notify(pos);
-
-        name_len = (int)strlen(trackbar->name);
-
-        if( name_len > max_name_len )
+        trackbar.pos = pos;
+        if (trackbar.onChangeCallback)
+            trackbar.onChangeCallback(pos, trackbar.userdata);
+        if (trackbar.notify2)
+            trackbar.notify2(pos, trackbar.userdata);
+        if (trackbar.notify)
+            trackbar.notify(pos);
+
+        name_len = (int)trackbar.name.size();
+
+        // TODO replace C strings manipulation
+        if (name_len > max_name_len)
         {
             int start_len = max_name_len*2/3;
             int end_len = max_name_len - start_len - 2;
-            memcpy( pos_text, trackbar->name, start_len );
-            memcpy( pos_text + start_len, "...", 3 );
-            memcpy( pos_text + start_len + 3, trackbar->name + name_len - end_len, end_len + 1 );
+            memcpy(pos_text, trackbar.name.c_str(), start_len);
+            memcpy(pos_text + start_len, "...", 3);
+            memcpy(pos_text + start_len + 3, trackbar.name.c_str() + name_len - end_len, end_len + 1);
         }
         else
         {
-            memcpy( pos_text, trackbar->name, name_len + 1);
+            memcpy(pos_text, trackbar.name.c_str(), name_len + 1);
         }
 
-        sprintf( pos_text + strlen(pos_text), "%s: %d\n", suffix, pos );
-        SetWindowText( trackbar->buddy, pos_text );
+        sprintf(pos_text + strlen(pos_text), "%s: %d\n", suffix, pos);
+        SetWindowText(trackbar.buddy, pos_text);
     }
 }
 
 
-static LRESULT CALLBACK HGToolbarProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam )
+static LRESULT CALLBACK HGToolbarProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam)
 {
-    CvWindow* window = icvWindowByHWND( hwnd );
-    if(!window)
+    auto window_ = icvWindowByHWND(hwnd);
+    if (!window_)
         return DefWindowProc(hwnd, uMsg, wParam, lParam);
 
+    CvWindow& window = *window_;
+
     // Control messages processing
     switch(uMsg)
     {
@@ -1940,32 +2037,34 @@ static LRESULT CALLBACK HGToolbarProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPAR
         {
             HWND slider = (HWND)lParam;
             int pos = (int)SendMessage(slider, TBM_GETPOS, 0, 0);
-            CvTrackbar* trackbar = icvTrackbarByHWND( slider );
+            auto trackbar = icvTrackbarByHWND(slider);
 
-            if( trackbar )
+            if (trackbar)
             {
-                if( trackbar->pos != pos )
-                    icvUpdateTrackbar( trackbar, pos );
+                if (trackbar->pos != pos)
+                    icvUpdateTrackbar(*trackbar, pos);
             }
 
-            SetFocus( window->hwnd );
+            SetFocus(window.hwnd);
             return 0;
         }
 
     case WM_NCCALCSIZE:
         {
-            LRESULT ret = CallWindowProc(window->toolbar.toolBarProc, hwnd, uMsg, wParam, lParam);
+            LRESULT ret = CallWindowProc(window.toolbar.toolBarProc, hwnd, uMsg, wParam, lParam);
             int rows = (int)SendMessage(hwnd, TB_GETROWS, 0, 0);
 
-            if(window->toolbar.rows != rows)
+            if (window.toolbar.rows != rows)
             {
-                SendMessage(window->toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
-                CvTrackbar* trackbar = window->toolbar.first;
+                SendMessage(window.toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
+                auto& trakbars = window.toolbar.trackbars;
 
-                for( ; trackbar != 0; trackbar = trackbar->next )
+                for (auto it = trakbars.begin(); it != trakbars.end(); ++it)
                 {
+                    auto trackbar = *it;
+                    CV_Assert(trackbar);
                     RECT rect = { 0 };
-                    SendMessage(window->toolbar.toolbar, TB_GETITEMRECT,
+                    SendMessage(window.toolbar.toolbar, TB_GETITEMRECT,
                                (WPARAM)trackbar->id, (LPARAM)&rect);
                     MoveWindow(trackbar->hwnd, rect.left + HG_BUDDY_WIDTH, rect.top,
                                rect.right - rect.left - HG_BUDDY_WIDTH,
@@ -1973,46 +2072,63 @@ static LRESULT CALLBACK HGToolbarProc( HWND hwnd, UINT uMsg, WPARAM wParam, LPAR
                     MoveWindow(trackbar->buddy, rect.left, rect.top,
                                HG_BUDDY_WIDTH, rect.bottom - rect.top, FALSE);
                 }
-                window->toolbar.rows = rows;
+                window.toolbar.rows = rows;
             }
             return ret;
         }
     }
 
-    return CallWindowProc(window->toolbar.toolBarProc, hwnd, uMsg, wParam, lParam);
+    return CallWindowProc(window.toolbar.toolBarProc, hwnd, uMsg, wParam, lParam);
 }
 
 
 CV_IMPL void
 cvDestroyAllWindows(void)
 {
-    CvWindow* window = hg_windows;
-
-    while( window )
+    std::vector< std::shared_ptr<CvWindow> > g_windows;
+    {
+        AutoLock lock(getWindowMutex());
+        g_windows = getWindowsList();  // copy
+    }
+    for (auto it = g_windows.begin(); it != g_windows.end(); ++it)
     {
-        HWND mainhWnd = window->frame;
-        HWND hwnd = window->hwnd;
-        window = window->next;
+        auto window_ = *it;
+        if (!window_)
+            continue;
+
+        {
+            CvWindow& window = *window_;
+
+            HWND mainhWnd = window.frame;
+            HWND hwnd = window.hwnd;
 
-        SendMessage( hwnd, WM_CLOSE, 0, 0 );
-        SendMessage( mainhWnd, WM_CLOSE, 0, 0 );
+            SendMessage(hwnd, WM_CLOSE, 0, 0);
+            SendMessage(mainhWnd, WM_CLOSE, 0, 0);
+        }
+
+        window_.reset();
+    }
+    // TODO needed?
+    {
+        AutoLock lock(getWindowMutex());
+        getWindowsList().clear();
     }
 }
 
-static void showSaveDialog(CvWindow* window)
+static void showSaveDialog(CvWindow& window)
 {
-    if (!window || !window->image)
+    if (!window.image)
         return;
 
     SIZE sz;
     int channels;
     void* data;
-    if (icvGetBitmapData(window, &sz, &channels, &data))
+    if (!icvGetBitmapData(window, sz, channels, data))
         return; // nothing to save
 
     char szFileName[MAX_PATH] = "";
     // try to use window title as file name
-    GetWindowText(window->frame, szFileName, MAX_PATH);
+    GetWindowText(window.frame, szFileName, MAX_PATH);
 
     OPENFILENAME ofn;
     ZeroMemory(&ofn, sizeof(ofn));
@@ -2022,7 +2138,7 @@ static void showSaveDialog(CvWindow* window)
 #else
     ofn.lStructSize = sizeof(ofn);
 #endif
-    ofn.hwndOwner = window->hwnd;
+    ofn.hwndOwner = window.hwnd;
     ofn.lpstrFilter =
 #ifdef HAVE_PNG
                       "Portable Network Graphics files (*.png)\0*.png\0"
@@ -2075,15 +2191,22 @@ static bool handleMessage(MSG& message, int& keyCode)
     // otherwise the message was handled specifically
     bool is_processed = false;
 
-    for (CvWindow* window = hg_windows; window != 0 && is_processed == 0; window = window->next)
+    AutoLock lock(getWindowMutex());
+    auto& g_windows = getWindowsList();
+    for (auto it = g_windows.begin(); it != g_windows.end() && !is_processed; ++it)
     {
-        if (!(window->hwnd == message.hwnd || window->frame == message.hwnd))
+        auto window_ = *it;
+        if (!window_)
+            continue;
+        CvWindow& window = *window_;
+        if (!(window.hwnd == message.hwnd || window.frame == message.hwnd))
             continue;
 
         is_processed = true;
         switch (message.message)
         {
             case WM_DESTROY:
+                // fallthru
             case WM_CHAR:
                 DispatchMessage(&message);
                 keyCode = (int)message.wParam;
@@ -2099,6 +2222,20 @@ static bool handleMessage(MSG& message, int& keyCode)
                 break;
 
             case WM_KEYDOWN:
+                // Intercept Ctrl+C for copy to clipboard
+                if ('C' == message.wParam && (::GetKeyState(VK_CONTROL) >> 15))
+                {
+                    ::SendMessage(message.hwnd, WM_COPY, 0, 0);
+                    return false;
+                }
+
+                // Intercept Ctrl+S for "save as" dialog
+                if ('S' == message.wParam && (::GetKeyState(VK_CONTROL) >> 15))
+                {
+                    showSaveDialog(window);
+                    return false;
+                }
+
                 TranslateMessage(&message);
                 if ((message.wParam >= VK_F1 && message.wParam <= VK_F24)      ||
                     message.wParam == VK_HOME   || message.wParam == VK_END    ||
@@ -2113,13 +2250,7 @@ static bool handleMessage(MSG& message, int& keyCode)
                     return true;
                 }
 
-                // Intercept Ctrl+C for copy to clipboard
-                if ('C' == message.wParam && (::GetKeyState(VK_CONTROL) >> 15))
-                    ::SendMessage(message.hwnd, WM_COPY, 0, 0);
-
-                // Intercept Ctrl+S for "save as" dialog
-                if ('S' == message.wParam && (::GetKeyState(VK_CONTROL) >> 15))
-                    showSaveDialog(window);
+                // fallthru
 
             default:
                 DispatchMessage(&message);
@@ -2140,7 +2271,7 @@ static bool handleMessage(MSG& message, int& keyCode)
 /*
  * process until queue is empty but don't wait.
  */
-int cv::pollKey()
+int pollKey_W32()
 {
     CV_TRACE_FUNCTION();
     for(;;)
@@ -2156,7 +2287,7 @@ int cv::pollKey()
 }
 
 CV_IMPL int
-cvWaitKey( int delay )
+cvWaitKey(int delay)
 {
     int64 time0 = cv::getTickCount();
     int64 timeEnd = time0 + (int64)(delay * 0.001f * cv::getTickFrequency());
@@ -2165,9 +2296,9 @@ cvWaitKey( int delay )
     {
         MSG message;
 
-        if( (delay <= 0) && hg_windows)
+        if ((delay <= 0) && !getWindowsList().empty())
             GetMessage(&message, 0, 0, 0);
-        else if( PeekMessage(&message, 0, 0, 0, PM_REMOVE) == FALSE )
+        else if (PeekMessage(&message, 0, 0, 0, PM_REMOVE) == FALSE)
         {
             int64 t = cv::getTickCount();
             if (t - timeEnd >= 0)
@@ -2183,110 +2314,135 @@ cvWaitKey( int delay )
 }
 
 
-static CvTrackbar*
-icvFindTrackbarByName( const CvWindow* window, const char* name )
+static
+std::shared_ptr<CvTrackbar> icvFindTrackbarByName(CvWindow& window, const std::string& name)
 {
-    CvTrackbar* trackbar = window->toolbar.first;
-
-    for( ; trackbar != 0 && strcmp( trackbar->name, name ) != 0; trackbar = trackbar->next )
-        ;
-
-    return trackbar;
+    auto trackbars = window.toolbar.trackbars;
+    for (auto it = trackbars.begin(); it != trackbars.end(); ++it)
+    {
+        auto& trackbar = *it;
+        CV_Assert(trackbar);
+        if (trackbar->name == name)
+            return trackbar;
+    }
+    return std::shared_ptr<CvTrackbar>();
+}
+static inline
+std::shared_ptr<CvTrackbar> icvFindTrackbarByName(const std::shared_ptr<CvWindow>& window, const std::string& name)
+{
+    CV_Assert(window);
+    return icvFindTrackbarByName(window, name);
 }
 
+static
+std::shared_ptr<CvTrackbar> createTrackbar_(CvWindow& window, const std::string& trackbar_name,
+    int count,
+    TrackbarCallback onChange, void* userdata);
 
 static int
-icvCreateTrackbar( const char* trackbar_name, const char* window_name,
-                   int* val, int count, CvTrackbarCallback on_notify,
-                   CvTrackbarCallback2 on_notify2, void* userdata )
+icvCreateTrackbar(const char* trackbar_name, const char* window_name,
+                  int* val, int count, CvTrackbarCallback on_notify,
+                  CvTrackbarCallback2 on_notify2, void* userdata)
 {
-    int result = 0;
+    CV_FUNCNAME("icvCreateTrackbar");
 
-    CV_FUNCNAME( "icvCreateTrackbar" );
+    AutoLock lock(getWindowMutex());
 
-    __BEGIN__;
+    if (!window_name || !trackbar_name)
+        CV_Error(Error::StsNullPtr, "NULL window or trackbar name");
 
-    char slider_name[32];
-    CvWindow* window = 0;
-    CvTrackbar* trackbar = 0;
-    int pos = 0;
+    if (count < 0)
+        CV_Error(Error::StsOutOfRange, "Bad trackbar maximal value");
 
-    if( !window_name || !trackbar_name )
-        CV_ERROR( CV_StsNullPtr, "NULL window or trackbar name" );
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
 
-    if( count < 0 )
-        CV_ERROR( CV_StsOutOfRange, "Bad trackbar maximal value" );
+    auto trackbar = icvFindTrackbarByName(*window, trackbar_name);
+    if (!trackbar)
+        trackbar = createTrackbar_(*window, trackbar_name, count, nullptr, userdata);
+    CV_Assert(trackbar);
 
-    window = icvFindWindowByName(window_name);
-    if( !window )
-        EXIT;
+    trackbar->notify = on_notify;
+    trackbar->notify2 = on_notify2;
+    trackbar->userdata = userdata;
+    trackbar->data = val;
 
-    trackbar = icvFindTrackbarByName(window,trackbar_name);
-    if( !trackbar )
-    {
-        TBBUTTON tbs = {};
-        TBBUTTONINFO tbis = {};
-        RECT rect = { 0 };
-        int bcount;
-        int len = (int)strlen( trackbar_name );
+    return 1;
+}
 
-        // create toolbar if it is not created yet
-        if( !window->toolbar.toolbar )
-        {
-            const int default_height = 30;
-
-            // CreateToolbarEx is deprecated and forces linking against Comctl32.lib.
-            window->toolbar.toolbar = CreateWindowEx(0, TOOLBARCLASSNAME, NULL,
-                                        WS_CHILD | CCS_TOP | TBSTYLE_WRAPABLE | BTNS_AUTOSIZE | BTNS_BUTTON,
-                                        0, 0, 0, 0,
-                                        window->frame, NULL, GetModuleHandle(NULL), NULL);
-            // CreateToolbarEx automatically sends this but CreateWindowEx doesn't.
-            SendMessage(window->toolbar.toolbar, TB_BUTTONSTRUCTSIZE, (WPARAM)sizeof(TBBUTTON), 0);
-
-            GetClientRect(window->frame, &rect);
-            MoveWindow( window->toolbar.toolbar, 0, 0,
-                        rect.right - rect.left, default_height, TRUE);
-            SendMessage(window->toolbar.toolbar, TB_AUTOSIZE, 0, 0);
-            ShowWindow(window->toolbar.toolbar, SW_SHOW);
-
-            window->toolbar.first = 0;
-            window->toolbar.pos = 0;
-            window->toolbar.rows = 0;
-            window->toolbar.toolBarProc =
-                (WNDPROC)icvGetWindowLongPtr(window->toolbar.toolbar, CV_WNDPROC);
-
-            icvUpdateWindowPos(window);
-
-            // Subclassing from toolbar
-            icvSetWindowLongPtr(window->toolbar.toolbar, CV_WNDPROC, HGToolbarProc);
-            icvSetWindowLongPtr(window->toolbar.toolbar, CV_USERDATA, window);
-        }
+static void createToolbar_(CvWindow& window)
+{
+    CV_Assert(!window.toolbar.toolbar);
+
+    const int default_height = 30;
+
+    // CreateToolbarEx is deprecated and forces linking against Comctl32.lib.
+    window.toolbar.toolbar = CreateWindowEx(0, TOOLBARCLASSNAME, NULL,
+                                WS_CHILD | CCS_TOP | TBSTYLE_WRAPABLE | BTNS_AUTOSIZE | BTNS_BUTTON,
+                                0, 0, 0, 0,
+                                window.frame, NULL, GetModuleHandle(NULL), NULL);
+    // CreateToolbarEx automatically sends this but CreateWindowEx doesn't.
+    SendMessage(window.toolbar.toolbar, TB_BUTTONSTRUCTSIZE, (WPARAM)sizeof(TBBUTTON), 0);
+
+    RECT rect;
+    GetClientRect(window.frame, &rect);
+    MoveWindow(window.toolbar.toolbar, 0, 0,
+               rect.right - rect.left, default_height, TRUE);
+    SendMessage(window.toolbar.toolbar, TB_AUTOSIZE, 0, 0);
+    ShowWindow(window.toolbar.toolbar, SW_SHOW);
+
+    window.toolbar.pos = 0;
+    window.toolbar.rows = 0;
+    window.toolbar.toolBarProc =
+        (WNDPROC)icvGetWindowLongPtr(window.toolbar.toolbar, CV_WNDPROC);
+
+    icvUpdateWindowPos(window);
 
-        /* Retrieve current buttons count */
-        bcount = (int)SendMessage(window->toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
+    // Subclassing from toolbar
+    icvSetWindowLongPtr(window.toolbar.toolbar, CV_WNDPROC, HGToolbarProc);
+    icvSetWindowLongPtr(window.toolbar.toolbar, CV_USERDATA, (void*)&window);
 
-        if(bcount > 1)
-        {
-            /* If this is not the first button then we need to
-            separate it from the previous one */
-            tbs.iBitmap = 0;
-            tbs.idCommand = bcount; // Set button id to it's number
-            tbs.iString = 0;
-            tbs.fsStyle = TBSTYLE_SEP;
-            tbs.fsState = TBSTATE_ENABLED;
-            SendMessage(window->toolbar.toolbar, TB_ADDBUTTONS, 1, (LPARAM)&tbs);
-
-            // Retrieve current buttons count
-            bcount = (int)SendMessage(window->toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
-        }
+}
+
+static
+std::shared_ptr<CvTrackbar> createTrackbar_(CvWindow& window, const std::string& trackbar_name,
+    int count,
+    TrackbarCallback onChange, void* userdata)
+{
+    // create toolbar if it is not created yet
+    if (!window.toolbar.toolbar)
+    {
+        createToolbar_(window);
+    }
 
-        /* Add a button which we're going to cover with the slider */
+    TBBUTTON tbs = {};
+
+    /* Retrieve current buttons count */
+    int bcount = (int)SendMessage(window.toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
+
+    if (bcount > 1)
+    {
+        /* If this is not the first button then we need to
+        separate it from the previous one */
         tbs.iBitmap = 0;
         tbs.idCommand = bcount; // Set button id to it's number
+        tbs.iString = 0;
+        tbs.fsStyle = TBSTYLE_SEP;
         tbs.fsState = TBSTATE_ENABLED;
+        SendMessage(window.toolbar.toolbar, TB_ADDBUTTONS, 1, (LPARAM)&tbs);
+
+        // Retrieve current buttons count
+        bcount = (int)SendMessage(window.toolbar.toolbar, TB_BUTTONCOUNT, 0, 0);
+    }
+
+    /* Add a button which we're going to cover with the slider */
+    tbs.iBitmap = 0;
+    tbs.idCommand = bcount; // Set button id to it's number
+    tbs.fsState = TBSTATE_ENABLED;
 #if 0/*!defined WIN64 && !defined EM64T*/
-        tbs.fsStyle = 0;
-        tbs.iString = 0;
+    tbs.fsStyle = 0;
+    tbs.iString = 0;
 #else
 
 #ifndef TBSTYLE_AUTOSIZE
@@ -2296,320 +2452,640 @@ icvCreateTrackbar( const char* trackbar_name, const char* window_name,
 #ifndef TBSTYLE_GROUP
 #define TBSTYLE_GROUP           0x0004
 #endif
-        //tbs.fsStyle = TBSTYLE_AUTOSIZE;
-        tbs.fsStyle = TBSTYLE_GROUP;
-        tbs.iString = (INT_PTR)trackbar_text;
+    //tbs.fsStyle = TBSTYLE_AUTOSIZE;
+    tbs.fsStyle = TBSTYLE_GROUP;
+    tbs.iString = (INT_PTR)trackbar_text;
 #endif
-        SendMessage(window->toolbar.toolbar, TB_ADDBUTTONS, 1, (LPARAM)&tbs);
-
-        /* Adjust button size to the slider */
-        tbis.cbSize = sizeof(tbis);
-        tbis.dwMask = TBIF_SIZE;
-
-        GetClientRect(window->hwnd, &rect);
-        tbis.cx = (unsigned short)(rect.right - rect.left);
-
-        SendMessage(window->toolbar.toolbar, TB_SETBUTTONINFO,
-            (WPARAM)tbs.idCommand, (LPARAM)&tbis);
-
-        /* Get button pos */
-        SendMessage(window->toolbar.toolbar, TB_GETITEMRECT,
-            (WPARAM)tbs.idCommand, (LPARAM)&rect);
-
-        /* Create a slider */
-        trackbar = (CvTrackbar*)cvAlloc( sizeof(CvTrackbar) + len + 1 );
-        trackbar->signature = CV_TRACKBAR_MAGIC_VAL;
-        trackbar->notify = 0;
-        trackbar->notify2 = 0;
-        trackbar->parent = window;
-        trackbar->pos = 0;
-        trackbar->data = 0;
-        trackbar->id = bcount;
-        trackbar->next = window->toolbar.first;
-        trackbar->name = (char*)(trackbar + 1);
-        memcpy( trackbar->name, trackbar_name, len + 1 );
-        window->toolbar.first = trackbar;
-
-        sprintf(slider_name, "Trackbar%p", val);
-        trackbar->hwnd = CreateWindowEx(0, TRACKBAR_CLASS, slider_name,
-                            WS_CHILD | WS_VISIBLE | TBS_AUTOTICKS |
-                            TBS_FIXEDLENGTH | TBS_HORZ | TBS_BOTTOM,
-                            rect.left + HG_BUDDY_WIDTH, rect.top,
-                            rect.right - rect.left - HG_BUDDY_WIDTH,
-                            rect.bottom - rect.top, window->toolbar.toolbar,
-                            (HMENU)(size_t)bcount, hg_hinstance, 0);
-
-        sprintf(slider_name,"Buddy%p", val);
-        trackbar->buddy = CreateWindowEx(0, "STATIC", slider_name,
-                            WS_CHILD | SS_RIGHT,
-                            rect.left, rect.top,
-                            HG_BUDDY_WIDTH, rect.bottom - rect.top,
-                            window->toolbar.toolbar, 0, hg_hinstance, 0);
-
-        icvSetWindowLongPtr( trackbar->hwnd, CV_USERDATA, trackbar );
-
-        /* Minimize the number of rows */
-        SendMessage( window->toolbar.toolbar, TB_SETROWS,
-                     MAKEWPARAM(1, FALSE), (LPARAM)&rect );
-    }
-    else
-    {
-        trackbar->data = 0;
-        trackbar->notify = 0;
-        trackbar->notify2 = 0;
-    }
+    SendMessage(window.toolbar.toolbar, TB_ADDBUTTONS, 1, (LPARAM)&tbs);
+
+    TBBUTTONINFO tbis = {};
+
+    /* Adjust button size to the slider */
+    tbis.cbSize = sizeof(tbis);
+    tbis.dwMask = TBIF_SIZE;
+
+    RECT rect = { 0 };
+    GetClientRect(window.hwnd, &rect);
+    tbis.cx = (unsigned short)(rect.right - rect.left);
+
+    SendMessage(window.toolbar.toolbar, TB_SETBUTTONINFO,
+        (WPARAM)tbs.idCommand, (LPARAM)&tbis);
+
+    /* Get button pos */
+    SendMessage(window.toolbar.toolbar, TB_GETITEMRECT,
+        (WPARAM)tbs.idCommand, (LPARAM)&rect);
+
+    /* Create a slider */
+    auto trackbar = std::make_shared<CvTrackbar>(window, trackbar_name);
+    trackbar->id = bcount;
+    window.toolbar.trackbars.push_back(trackbar);
+
+    auto slider_name = cv::format("Trackbar%p", trackbar.get());
+    trackbar->hwnd = CreateWindowEx(0, TRACKBAR_CLASS, slider_name.c_str(),
+                        WS_CHILD | WS_VISIBLE | TBS_AUTOTICKS |
+                        TBS_FIXEDLENGTH | TBS_HORZ | TBS_BOTTOM,
+                        rect.left + HG_BUDDY_WIDTH, rect.top,
+                        rect.right - rect.left - HG_BUDDY_WIDTH,
+                        rect.bottom - rect.top, window.toolbar.toolbar,
+                        (HMENU)(size_t)bcount, hg_hinstance, 0);
+
+    slider_name = cv::format("Buddy%p", trackbar.get());
+    trackbar->buddy = CreateWindowEx(0, "STATIC", slider_name.c_str(),
+                        WS_CHILD | SS_RIGHT,
+                        rect.left, rect.top,
+                        HG_BUDDY_WIDTH, rect.bottom - rect.top,
+                        window.toolbar.toolbar, 0, hg_hinstance, 0);
+
+    icvSetWindowLongPtr(trackbar->hwnd, CV_USERDATA, (void*)trackbar.get());
+
+    /* Minimize the number of rows */
+    SendMessage(window.toolbar.toolbar, TB_SETROWS,
+                MAKEWPARAM(1, FALSE), (LPARAM)&rect);
 
     trackbar->maxval = count;
 
     /* Adjust slider parameters */
     SendMessage(trackbar->hwnd, TBM_SETRANGEMIN, (WPARAM)TRUE, (LPARAM)0);
     SendMessage(trackbar->hwnd, TBM_SETRANGEMAX, (WPARAM)TRUE, (LPARAM)count);
-    SendMessage(trackbar->hwnd, TBM_SETTICFREQ, (WPARAM)1, (LPARAM)0 );
-    if( val )
-        pos = *val;
+    SendMessage(trackbar->hwnd, TBM_SETTICFREQ, (WPARAM)1, (LPARAM)0);
 
-    SendMessage(trackbar->hwnd, TBM_SETPOS, (WPARAM)TRUE, (LPARAM)pos );
-    SendMessage(window->toolbar.toolbar, TB_AUTOSIZE, 0, 0);
+    int pos = 0;
+    SendMessage(trackbar->hwnd, TBM_SETPOS, (WPARAM)TRUE, (LPARAM)pos);
+    SendMessage(window.toolbar.toolbar, TB_AUTOSIZE, 0, 0);
 
     trackbar->pos = -1;
-    icvUpdateTrackbar( trackbar, pos );
-    ShowWindow( trackbar->buddy, SW_SHOW );
-    ShowWindow( trackbar->hwnd, SW_SHOW );
-
-    trackbar->notify = on_notify;
-    trackbar->notify2 = on_notify2;
-    trackbar->userdata = userdata;
-    trackbar->data = val;
+    icvUpdateTrackbar(*trackbar, pos);
+    ShowWindow(trackbar->buddy, SW_SHOW);
+    ShowWindow(trackbar->hwnd, SW_SHOW);
 
     /* Resize the window to reflect the toolbar resizing*/
     icvUpdateWindowPos(window);
 
-    result = 1;
-
-    __END__;
+    trackbar->onChangeCallback = onChange;
+    trackbar->userdata = userdata;
 
-    return result;
+    return trackbar;
 }
 
 CV_IMPL int
-cvCreateTrackbar( const char* trackbar_name, const char* window_name,
-                  int* val, int count, CvTrackbarCallback on_notify )
+cvCreateTrackbar(const char* trackbar_name, const char* window_name,
+                 int* val, int count, CvTrackbarCallback on_notify)
 {
-    return icvCreateTrackbar( trackbar_name, window_name, val, count,
-        on_notify, 0, 0 );
+    return icvCreateTrackbar(trackbar_name, window_name, val, count,
+        on_notify, 0, 0);
 }
 
 CV_IMPL int
-cvCreateTrackbar2( const char* trackbar_name, const char* window_name,
-                   int* val, int count, CvTrackbarCallback2 on_notify2,
-                   void* userdata )
+cvCreateTrackbar2(const char* trackbar_name, const char* window_name,
+                  int* val, int count, CvTrackbarCallback2 on_notify2,
+                  void* userdata)
 {
-    return icvCreateTrackbar( trackbar_name, window_name, val, count,
-        0, on_notify2, userdata );
+    return icvCreateTrackbar(trackbar_name, window_name, val, count,
+        0, on_notify2, userdata);
 }
 
 CV_IMPL void
-cvSetMouseCallback( const char* window_name, CvMouseCallback on_mouse, void* param )
+cvSetMouseCallback(const char* name, CvMouseCallback on_mouse, void* param)
 {
-    CV_FUNCNAME( "cvSetMouseCallback" );
-
-    __BEGIN__;
+    CV_FUNCNAME("cvSetMouseCallback");
 
-    CvWindow* window = 0;
+    if (!name)
+        CV_Error(Error::StsNullPtr, "NULL window name");
 
-    if( !window_name )
-        CV_ERROR( CV_StsNullPtr, "NULL window name" );
+    AutoLock lock(getWindowMutex());
 
-    window = icvFindWindowByName(window_name);
-    if( !window )
-        EXIT;
+    auto window = icvFindWindowByName(name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", name));
 
     window->on_mouse = on_mouse;
     window->on_mouse_param = param;
-
-    __END__;
 }
 
 
-CV_IMPL int cvGetTrackbarPos( const char* trackbar_name, const char* window_name )
+CV_IMPL int cvGetTrackbarPos(const char* trackbar_name, const char* window_name)
 {
-    int pos = -1;
-
-    CV_FUNCNAME( "cvGetTrackbarPos" );
+    CV_FUNCNAME("cvGetTrackbarPos");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
-    CvWindow* window;
-    CvTrackbar* trackbar = 0;
+    if (trackbar_name == 0 || window_name == 0)
+        CV_Error(Error::StsNullPtr, "NULL trackbar or window name");
 
-    if( trackbar_name == 0 || window_name == 0 )
-        CV_ERROR( CV_StsNullPtr, "NULL trackbar or window name" );
-
-    window = icvFindWindowByName( window_name );
-    if( window )
-        trackbar = icvFindTrackbarByName( window, trackbar_name );
-
-    if( trackbar )
-        pos = trackbar->pos;
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
 
-    __END__;
+    auto trackbar = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar)
+        CV_Error_(Error::StsNullPtr, ("NULL trackbar: '%s'", trackbar_name));
 
-    return pos;
+    return trackbar->pos;
 }
 
 
-CV_IMPL void cvSetTrackbarPos( const char* trackbar_name, const char* window_name, int pos )
+CV_IMPL void cvSetTrackbarPos(const char* trackbar_name, const char* window_name, int pos)
 {
-    CV_FUNCNAME( "cvSetTrackbarPos" );
+    CV_FUNCNAME("cvSetTrackbarPos");
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
-    CvWindow* window;
-    CvTrackbar* trackbar = 0;
+    if (trackbar_name == 0 || window_name == 0)
+        CV_Error(Error::StsNullPtr, "NULL trackbar or window name");
 
-    if( trackbar_name == 0 || window_name == 0 )
-        CV_ERROR( CV_StsNullPtr, "NULL trackbar or window name" );
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
 
-    window = icvFindWindowByName( window_name );
-    if( window )
-        trackbar = icvFindTrackbarByName( window, trackbar_name );
+    auto trackbar = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar)
+        CV_Error_(Error::StsNullPtr, ("NULL trackbar: '%s'", trackbar_name));
 
-    if( trackbar )
     {
-        if( pos < 0 )
+        if (pos < 0)
             pos = 0;
 
-        if( pos > trackbar->maxval )
+        if (pos > trackbar->maxval)
             pos = trackbar->maxval;
 
-        SendMessage( trackbar->hwnd, TBM_SETPOS, (WPARAM)TRUE, (LPARAM)pos );
-        icvUpdateTrackbar( trackbar, pos );
+        SendMessage(trackbar->hwnd, TBM_SETPOS, (WPARAM)TRUE, (LPARAM)pos);
+        icvUpdateTrackbar(*trackbar, pos);
     }
-
-    __END__;
 }
 
 
 CV_IMPL void cvSetTrackbarMax(const char* trackbar_name, const char* window_name, int maxval)
 {
-    CV_FUNCNAME( "cvSetTrackbarMax" );
+    CV_FUNCNAME("cvSetTrackbarMax");
+
+    if (trackbar_name == 0 || window_name == 0)
+    {
+        CV_Error(Error::StsNullPtr, "NULL trackbar or window name");
+    }
 
-    __BEGIN__;
+    AutoLock lock(getWindowMutex());
 
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
+
+    auto trackbar = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar)
+        CV_Error_(Error::StsNullPtr, ("NULL trackbar: '%s'", trackbar_name));
+
+    // FIXIT
     if (maxval >= 0)
     {
-        CvWindow* window = 0;
-        CvTrackbar* trackbar = 0;
-        if (trackbar_name == 0 || window_name == 0)
+        // The position will be min(pos, maxval).
+        trackbar->maxval = (trackbar->minval>maxval)?trackbar->minval:maxval;
+        SendMessage(trackbar->hwnd, TBM_SETRANGEMAX, (WPARAM)TRUE, (LPARAM)maxval);
+    }
+}
+
+
+CV_IMPL void cvSetTrackbarMin(const char* trackbar_name, const char* window_name, int minval)
+{
+    CV_FUNCNAME("cvSetTrackbarMin");
+
+    if (trackbar_name == 0 || window_name == 0)
+    {
+        CV_Error(Error::StsNullPtr, "NULL trackbar or window name");
+    }
+
+    AutoLock lock(getWindowMutex());
+
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
+
+    auto trackbar = icvFindTrackbarByName(window, trackbar_name);
+    if (!trackbar)
+        CV_Error_(Error::StsNullPtr, ("NULL trackbar: '%s'", trackbar_name));
+
+    // FIXIT
+    if (minval >= 0)
+    {
+        // The position will be min(pos, maxval).
+        trackbar->minval = (minval<trackbar->maxval)?minval:trackbar->maxval;
+        SendMessage(trackbar->hwnd, TBM_SETRANGEMIN, (WPARAM)TRUE, (LPARAM)minval);
+    }
+}
+
+
+CV_IMPL void* cvGetWindowHandle(const char* window_name)
+{
+    CV_FUNCNAME("cvGetWindowHandle");
+
+    AutoLock lock(getWindowMutex());
+
+    if (window_name == 0)
+        CV_Error(Error::StsNullPtr, "NULL window name");
+
+    auto window = icvFindWindowByName(window_name);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%s'", window_name));
+
+    return (void*)window->hwnd;
+}
+
+// FIXIT: result is not safe to use
+CV_IMPL const char* cvGetWindowName(void* window_handle)
+{
+    CV_FUNCNAME("cvGetWindowName");
+
+    AutoLock lock(getWindowMutex());
+
+    if (window_handle == 0)
+        CV_Error(Error::StsNullPtr, "NULL window handle");
+
+    auto window = icvWindowByHWND((HWND)window_handle);
+    if (!window)
+        CV_Error_(Error::StsNullPtr, ("NULL window: '%p'", window_handle));
+
+    return window->name.c_str();
+}
+
+
+CV_IMPL void
+cvSetPreprocessFuncWin32_(const void* callback)
+{
+    hg_on_preprocess = (CvWin32WindowCallback)callback;
+}
+
+CV_IMPL void
+cvSetPostprocessFuncWin32_(const void* callback)
+{
+    hg_on_postprocess = (CvWin32WindowCallback)callback;
+}
+
+
+
+namespace cv { namespace impl {
+
+using namespace cv::highgui_backend;
+
+class Win32UITrackbar;
+
+class Win32UIWindow
+        : public UIWindow
+        , public std::enable_shared_from_this<Win32UIWindow>
+{
+protected:
+    const std::string name_;
+    std::weak_ptr<CvWindow> window_;
+    std::map<std::string, std::shared_ptr<Win32UITrackbar> > trackbars_;
+public:
+    Win32UIWindow(const std::string& name, const std::shared_ptr<CvWindow>& window)
+        : name_(name)
+        , window_(window)
+    {
+        // nothing
+    }
+
+    ~Win32UIWindow() CV_OVERRIDE
+    {
+        if (!window_.expired())
+            destroy();
+        CV_LOG_DEBUG(NULL, "OpenCV/UI/Win32UI: Win32UIWindow(" << name_ << ") is disposed");
+    }
+
+    const std::string& getID() const CV_OVERRIDE { return name_; }
+
+    bool isActive() const CV_OVERRIDE { return !window_.expired(); }
+
+    void destroy() CV_OVERRIDE
+    {
+        cv::AutoLock lock(getWindowMutex());
+        if (!window_.expired())
         {
-            CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+            auto window = window_.lock();
+            if (window)
+                window->destroy();
+            window_.reset();
         }
+    }
+
+    void imshow(InputArray image) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        Mat image_mat = image.getMat();
+        showImage_(window, image_mat);
+    }
 
-        window = icvFindWindowByName(window_name);
-        if (window)
+    double getProperty(int prop) const CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        // see cvGetWindowProperty
+        switch ((WindowPropertyFlags)prop)
         {
-            trackbar = icvFindTrackbarByName(window, trackbar_name);
-            if (trackbar)
-            {
-                // The position will be min(pos, maxval).
-                trackbar->maxval = (trackbar->minval>maxval)?trackbar->minval:maxval;
-                SendMessage(trackbar->hwnd, TBM_SETRANGEMAX, (WPARAM)TRUE, (LPARAM)maxval);
-            }
+        case WND_PROP_FULLSCREEN:
+            return (double)window.status;
+
+        case WND_PROP_AUTOSIZE:
+            return (window.flags & WINDOW_AUTOSIZE) ? 1.0 : 0.0;
+
+        case WND_PROP_ASPECT_RATIO:
+            return static_cast<double>(window.width) / window.height;
+
+#ifdef HAVE_OPENGL
+        case WND_PROP_OPENGL:
+            return window.useGl ? 1.0 : 0.0;
+#endif
+
+        case WND_PROP_VISIBLE:
+            return 1.0;
+
+        case WND_PROP_TOPMOST:
+            return getPropTopmost_(window);
+
+        case WND_PROP_VSYNC:
+            return getPropVsync_(window);
+
+        // don't use default, add unsupported cases below:
+        // case WND_PROP_UNSUPPORTED:  // fallthru
+        //    break;
         }
+        return std::numeric_limits<double>::quiet_NaN();
     }
 
-    __END__;
-}
+    bool setProperty(int prop, double value) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        // see cvSetWindowProperty
+        switch ((WindowPropertyFlags)prop)
+        {
+        case WND_PROP_FULLSCREEN:
+            if (value != WINDOW_NORMAL && value != WINDOW_FULLSCREEN)  // bad arg
+                break;
+            setModeWindow_(window, (int)value);
+            return true;
 
+        case WND_PROP_TOPMOST:
+            return setPropTopmost_(window, value != 0.0);
 
-CV_IMPL void cvSetTrackbarMin(const char* trackbar_name, const char* window_name, int minval)
-{
-    CV_FUNCNAME( "cvSetTrackbarMin" );
+        case WND_PROP_VSYNC:
+            return setPropVsync_(window, value != 0.0);
 
-    __BEGIN__;
+        // don't use default, add unsupported cases below:
+        // case WND_PROP_UNSUPPORTED:  // fallthru
+        case WND_PROP_AUTOSIZE:  // fallthru
+        case WND_PROP_ASPECT_RATIO:  // fallthru
+        case WND_PROP_OPENGL:  // fallthru
+        case WND_PROP_VISIBLE:  // fallthru
+            break;
+        }
+        return false;
+    }
 
-    if (minval >= 0)
+    void resize(int width, int height) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        resizeWindow_(window, Size(width, height));
+    }
+
+    void move(int x, int y) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        moveWindow_(window, Point(x, y));
+    }
+
+    Rect getImageRect() const CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        return getImageRect_(window);
+    }
+
+    void setTitle(const std::string& title) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        if (!SetWindowText(window.frame, title.c_str()))
+            CV_Error_(Error::StsError, ("Failed to set \"%s\" window title to \"%s\"", window.name.c_str(), title.c_str()));
+    }
+
+    void setMouseCallback(MouseCallback onMouse, void* userdata /*= 0*/) CV_OVERRIDE
     {
-        CvWindow* window = 0;
-        CvTrackbar* trackbar = 0;
-        if (trackbar_name == 0 || window_name == 0)
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        window.on_mouse = onMouse;
+        window.on_mouse_param = userdata;
+    }
+
+    std::shared_ptr<UITrackbar> createTrackbar(
+        const std::string& name,
+        int count,
+        TrackbarCallback onChange /*= 0*/,
+        void* userdata /*= 0*/
+    ) CV_OVERRIDE
+    {
+        auto window_ptr = window_.lock();
+        CV_Assert(window_ptr);
+        CvWindow& window = *window_ptr;
+        CV_LOG_INFO(NULL, "OpenCV/UI: Creating Win32UI trackbar at '" << name_ << "': '" << name << "'");
+        auto trackbar = createTrackbar_(window, name, count, onChange, userdata);
+        auto ui_trackbar = std::make_shared<Win32UITrackbar>(name, trackbar, shared_from_this());
         {
-            CV_ERROR(CV_StsNullPtr, "NULL trackbar or window name");
+            cv::AutoLock lock(getWindowMutex());
+            trackbars_.emplace(name, ui_trackbar);
         }
+        return std::static_pointer_cast<UITrackbar>(ui_trackbar);
+    }
 
-        window = icvFindWindowByName(window_name);
-        if (window)
+    std::shared_ptr<UITrackbar> findTrackbar(const std::string& name) CV_OVERRIDE
+    {
+        cv::AutoLock lock(getWindowMutex());
+        auto i = trackbars_.find(name);
+        if (i != trackbars_.end())
         {
-            trackbar = icvFindTrackbarByName(window, trackbar_name);
-            if (trackbar)
-            {
-                // The position will be min(pos, maxval).
-                trackbar->minval = (minval<trackbar->maxval)?minval:trackbar->maxval;
-                SendMessage(trackbar->hwnd, TBM_SETRANGEMIN, (WPARAM)TRUE, (LPARAM)minval);
-            }
+            return std::static_pointer_cast<UITrackbar>(i->second);
         }
+        return std::shared_ptr<UITrackbar>();
     }
-
-    __END__;
-}
+};  // Win32UIWindow
 
 
-CV_IMPL void* cvGetWindowHandle( const char* window_name )
+class Win32UITrackbar : public UITrackbar
 {
-    void* hwnd = 0;
+protected:
+    /*const*/ std::string name_;
+    std::weak_ptr<CvTrackbar> trackbar_;
+    std::weak_ptr<Win32UIWindow> parent_;
+    std::map<std::string, std::shared_ptr<Win32UITrackbar> > trackbars_;
+public:
+    Win32UITrackbar(const std::string& name, const std::shared_ptr<CvTrackbar>& trackbar, const std::shared_ptr<Win32UIWindow>& parent)
+        : trackbar_(trackbar)
+        , parent_(parent)
+    {
+        name_ = std::string("<") + name + ">@" + parent->getID();
+    }
 
-    CV_FUNCNAME( "cvGetWindowHandle" );
+    ~Win32UITrackbar() CV_OVERRIDE
+    {
+        if (!trackbar_.expired())
+            destroy();
+        CV_LOG_DEBUG(NULL, "OpenCV/UI/Win32UI: Win32UITrackbar(" << name_ << ") is disposed");
+    }
 
-    __BEGIN__;
+    const std::string& getID() const CV_OVERRIDE { return name_; }
 
-    CvWindow* window;
+    bool isActive() const CV_OVERRIDE { return !trackbar_.expired(); }
 
-    if( window_name == 0 )
-        CV_ERROR( CV_StsNullPtr, "NULL window name" );
+    void destroy() CV_OVERRIDE
+    {
+        // nothing (destroyed with parent window, dedicated trackbar removal is not supported)
+    }
 
-    window = icvFindWindowByName( window_name );
-    if( window )
-        hwnd = (void*)window->hwnd;
+    int getPos() const CV_OVERRIDE
+    {
+        auto trackbar_ptr = trackbar_.lock();
+        CV_Assert(trackbar_ptr);
+        CvTrackbar& trackbar = *trackbar_ptr;
+        return trackbar.pos;
+    }
+    void setPos(int pos) CV_OVERRIDE
+    {
+        auto trackbar_ptr = trackbar_.lock();
+        CV_Assert(trackbar_ptr);
+        CvTrackbar& trackbar = *trackbar_ptr;
+        SendMessage(trackbar.hwnd, TBM_SETPOS, (WPARAM)TRUE, (LPARAM)pos);
+        icvUpdateTrackbar(trackbar, pos);
+    }
 
-    __END__;
+    cv::Range getRange() const CV_OVERRIDE
+    {
+        auto trackbar_ptr = trackbar_.lock();
+        CV_Assert(trackbar_ptr);
+        CvTrackbar& trackbar = *trackbar_ptr;
+        return cv::Range(trackbar.minval, trackbar.maxval);
+    }
 
-    return hwnd;
-}
+    void setRange(const cv::Range& range) CV_OVERRIDE
+    {
+        auto trackbar_ptr = trackbar_.lock();
+        CV_Assert(trackbar_ptr);
+        CvTrackbar& trackbar = *trackbar_ptr;
+        CV_CheckLE(range.start, range.end, "Invalid trackbar range");
+        trackbar.minval = range.start;
+        trackbar.maxval = range.start;
+        SendMessage(trackbar.hwnd, TBM_SETRANGEMIN, (WPARAM)TRUE, (LPARAM)trackbar.minval);
+        SendMessage(trackbar.hwnd, TBM_SETRANGEMAX, (WPARAM)TRUE, (LPARAM)trackbar.maxval);
+    }
+};  // Win32UITrackbar
 
 
-CV_IMPL const char* cvGetWindowName( void* window_handle )
+class Win32BackendUI : public UIBackend
 {
-    const char* window_name = "";
+public:
+    ~Win32BackendUI() CV_OVERRIDE
+    {
+        destroyAllWindows();
+    }
 
-    CV_FUNCNAME( "cvGetWindowName" );
+    void destroyAllWindows() CV_OVERRIDE
+    {
+        cvDestroyAllWindows();
+    }
 
-    __BEGIN__;
+    // namedWindow
+    virtual std::shared_ptr<UIWindow> createWindow(
+        const std::string& winname,
+        int flags
+    ) CV_OVERRIDE
+    {
+        CV_LOG_INFO(NULL, "OpenCV/UI: Creating Win32UI window: " << winname << " (" << flags << ")");
+        auto window = namedWindow_(winname, flags);
+        auto ui_window = std::make_shared<Win32UIWindow>(winname, window);
+        return ui_window;
+    }
 
-    CvWindow* window;
+    int waitKeyEx(int delay) CV_OVERRIDE
+    {
+        return cvWaitKey(delay);
+    }
+    int pollKey() CV_OVERRIDE
+    {
+        return pollKey_W32();
+    }
+};  // Win32BackendUI
 
-    if( window_handle == 0 )
-        CV_ERROR( CV_StsNullPtr, "NULL window" );
+static
+std::shared_ptr<Win32BackendUI>& getInstance()
+{
+    static std::shared_ptr<Win32BackendUI> g_instance = std::make_shared<Win32BackendUI>();
+    return g_instance;
+}
 
-    window = icvWindowByHWND( (HWND)window_handle );
-    if( window )
-        window_name = window->name;
+} // namespace impl
 
-    __END__;
+#ifndef BUILD_PLUGIN
+namespace highgui_backend {
 
-    return window_name;
+std::shared_ptr<UIBackend> createUIBackendWin32UI()
+{
+    return impl::getInstance();
 }
 
+}  // namespace highgui_backend
+#endif
 
-CV_IMPL void
-cvSetPreprocessFuncWin32_(const void* callback)
+}  // namespace
+
+
+#ifdef BUILD_PLUGIN
+
+#define ABI_VERSION 0
+#define API_VERSION 0
+#include "plugin_api.hpp"
+
+static
+CvResult cv_getInstance(CV_OUT CvPluginUIBackend* handle) CV_NOEXCEPT
 {
-    hg_on_preprocess = (CvWin32WindowCallback)callback;
+    try
+    {
+        if (!handle)
+            return CV_ERROR_FAIL;
+        *handle = cv::impl::getInstance().get();
+        return CV_ERROR_OK;
+    }
+    catch (...)
+    {
+        return CV_ERROR_FAIL;
+    }
 }
 
-CV_IMPL void
-cvSetPostprocessFuncWin32_(const void* callback)
+static const OpenCV_UI_Plugin_API plugin_api =
 {
-    hg_on_postprocess = (CvWin32WindowCallback)callback;
+    {
+        sizeof(OpenCV_UI_Plugin_API), ABI_VERSION, API_VERSION,
+        CV_VERSION_MAJOR, CV_VERSION_MINOR, CV_VERSION_REVISION, CV_VERSION_STATUS,
+        "Win32 OpenCV UI plugin"
+    },
+    {
+        /*  1*/cv_getInstance
+    }
+};
+
+const OpenCV_UI_Plugin_API* CV_API_CALL opencv_ui_plugin_init_v0(int requested_abi_version, int requested_api_version, void* /*reserved=NULL*/) CV_NOEXCEPT
+{
+    if (requested_abi_version == ABI_VERSION && requested_api_version <= API_VERSION)
+        return &plugin_api;
+    return NULL;
 }
 
-#endif //_WIN32
+#endif  // BUILD_PLUGIN
+
+#endif  // HAVE_WIN32UI
diff --git a/modules/imgcodecs/misc/objc/ios/Mat+Converters.h b/modules/imgcodecs/misc/objc/ios/Mat+Converters.h
index a3ee005c18be..0f74bb2f5dc7 100644
--- a/modules/imgcodecs/misc/objc/ios/Mat+Converters.h
+++ b/modules/imgcodecs/misc/objc/ios/Mat+Converters.h
@@ -1,5 +1,5 @@
 //
-//  Mat+UIImage.h
+//  Mat+Converters.h
 //
 //  Created by Giles Payne on 2020/03/03.
 //
diff --git a/modules/imgcodecs/misc/objc/ios/Mat+Converters.mm b/modules/imgcodecs/misc/objc/ios/Mat+Converters.mm
index 69250eb99415..79358cb6de7f 100644
--- a/modules/imgcodecs/misc/objc/ios/Mat+Converters.mm
+++ b/modules/imgcodecs/misc/objc/ios/Mat+Converters.mm
@@ -1,5 +1,5 @@
 //
-//  Mat+UIImage.mm
+//  Mat+Converters.mm
 //
 //  Created by Giles Payne on 2020/03/03.
 //
diff --git a/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.h b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.h
new file mode 100644
index 000000000000..341172798ed4
--- /dev/null
+++ b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.h
@@ -0,0 +1,27 @@
+//
+//  Mat+QuickLook.h
+//
+//  Created by Giles Payne on 2021/07/18.
+//
+
+#pragma once
+
+#ifdef __cplusplus
+#import "opencv2/core.hpp"
+#else
+#define CV_EXPORTS
+#endif
+
+#import "Mat.h"
+#import <Foundation/Foundation.h>
+#import <UIKit/UIKit.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+CV_EXPORTS @interface Mat (QuickLook)
+
+- (id)debugQuickLookObject;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm
new file mode 100644
index 000000000000..7bfee07eb131
--- /dev/null
+++ b/modules/imgcodecs/misc/objc/ios/Mat+QuickLook.mm
@@ -0,0 +1,155 @@
+//
+//  Mat+QuickLook.mm
+//
+//  Created by Giles Payne on 2021/07/18.
+//
+
+#import "Mat+QuickLook.h"
+#import "Mat+Converters.h"
+#import "Rect2i.h"
+#import "Core.h"
+#import "Imgproc.h"
+#import <opencv2/imgcodecs/ios.h>
+
+#define SIZE 20
+
+static UIFont* getCMU() {
+    return [UIFont fontWithName:@"CMU Serif" size:SIZE];
+}
+
+static UIFont* getBodoni72() {
+    return [UIFont fontWithName:@"Bodoni 72" size:SIZE];
+}
+
+static UIFont* getAnySerif() {
+    if (@available(iOS 13.0, *)) {
+        return [UIFont fontWithDescriptor:[[UIFontDescriptor preferredFontDescriptorWithTextStyle:UIFontTextStyleBody] fontDescriptorWithDesign:UIFontDescriptorSystemDesignSerif] size:SIZE];
+    } else {
+        return nil;
+    }
+}
+
+static UIFont* getSystemFont() {
+    return [UIFont systemFontOfSize:SIZE];
+}
+
+typedef UIFont* (*FontGetter)();
+
+@implementation Mat (QuickLook)
+
+- (NSString*)makeLabel:(BOOL)isIntType val:(NSNumber*)num {
+    if (isIntType) {
+        return [NSString stringWithFormat:@"%d", num.intValue];
+    } else {
+        int exponent = 1 + (int)log10(abs(num.doubleValue));
+        if (num.doubleValue == (double)num.intValue && num.doubleValue < 10000 && num.doubleValue > -10000) {
+            return [NSString stringWithFormat:@"%d", num.intValue];;
+        } else if (exponent <= 5 && exponent >= -1) {
+            return [NSString stringWithFormat:[NSString stringWithFormat:@"%%%d.%df", 6, MIN(5 - exponent, 4)], num.doubleValue];
+        } else {
+            return [[[NSString stringWithFormat:@"%.2e", num.doubleValue] stringByReplacingOccurrencesOfString:@"e+0" withString:@"e"] stringByReplacingOccurrencesOfString:@"e-0" withString:@"e-"];
+        }
+    }
+}
+
+- (void)relativeLine:(UIBezierPath*)path relX:(CGFloat)x relY:(CGFloat)y {
+    CGPoint curr = path.currentPoint;
+    [path addLineToPoint:CGPointMake(curr.x + x, curr.y + y)];
+}
+
+- (id)debugQuickLookObject {
+    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10) {
+        FontGetter fontGetters[] = { getCMU, getBodoni72, getAnySerif, getSystemFont };
+        UIFont* font = nil;
+        for (int fontGetterIndex = 0; font==nil && fontGetterIndex < (sizeof(fontGetters)) / (sizeof(fontGetters[0])); fontGetterIndex++) {
+            font = fontGetters[fontGetterIndex]();
+        }
+        int elements = [self rows] * [self cols];
+        NSDictionary<NSAttributedStringKey,id>* textFontAttributes = @{ NSFontAttributeName: font, NSForegroundColorAttributeName: UIColor.blackColor };
+        NSMutableArray<NSNumber*>* rawData = [NSMutableArray new];
+        for (int dataIndex = 0; dataIndex < elements; dataIndex++) {
+            [rawData addObject:[NSNumber numberWithDouble:0]];
+        }
+        [self get:0 col: 0 data: rawData];
+        BOOL isIntType = [self depth] <= CV_32S;
+        NSMutableArray<NSString*>* labels = [NSMutableArray new];
+        NSMutableDictionary<NSString*, NSValue*>* boundingRects = [NSMutableDictionary dictionaryWithCapacity:elements];
+        int maxWidth = 0, maxHeight = 0;
+        for (NSNumber* number in rawData) {
+            NSString* label = [self makeLabel:isIntType val:number];
+            [labels addObject:label];
+            CGRect boundingRect = [label boundingRectWithSize:CGSizeMake(CGFLOAT_MAX, CGFLOAT_MAX) options:NSStringDrawingUsesLineFragmentOrigin attributes:textFontAttributes context:nil];
+            if (boundingRect.size.width > maxWidth) {
+                maxWidth = boundingRect.size.width;
+            }
+            if (boundingRect.size.height > maxHeight) {
+                maxHeight = boundingRect.size.height;
+            }
+            boundingRects[label] = [NSValue valueWithCGRect:boundingRect];
+        }
+
+        int rowGap = 6;
+        int colGap = 6;
+        int borderGap = 8;
+        int lineThickness = 3;
+        int lipWidth = 6;
+        int imageWidth = 2 * (borderGap + lipWidth) + maxWidth * [self cols] + colGap * ([self cols] - 1);
+        int imageHeight = 2 * (borderGap + lipWidth) + maxHeight * [self rows] + rowGap * ([self rows] - 1);
+
+        UIBezierPath* leftBracket = [UIBezierPath new];
+        [leftBracket moveToPoint:CGPointMake(borderGap, borderGap)];
+        [self relativeLine:leftBracket relX:0 relY:imageHeight - 2 * borderGap];
+        [self relativeLine:leftBracket relX:lineThickness + lipWidth relY:0];
+        [self relativeLine:leftBracket relX:0 relY:-lineThickness];
+        [self relativeLine:leftBracket relX:-lipWidth relY:0];
+        [self relativeLine:leftBracket relX:0 relY:-(imageHeight - 2 * (borderGap + lineThickness))];
+        [self relativeLine:leftBracket relX:lipWidth relY:0];
+        [self relativeLine:leftBracket relX:0 relY:-lineThickness];
+        [leftBracket closePath];
+        CGAffineTransform reflect = CGAffineTransformConcat(CGAffineTransformMakeTranslation(-imageWidth, 0), CGAffineTransformMakeScale(-1, 1));
+        UIBezierPath* rightBracket = [leftBracket copy];
+        [rightBracket applyTransform:reflect];
+
+        CGRect rect = CGRectMake(0, 0, imageWidth, imageHeight);
+        UIGraphicsBeginImageContextWithOptions(rect.size, false, 0.0);
+        [UIColor.whiteColor setFill];
+        UIRectFill(rect);
+        [UIColor.blackColor setFill];
+        [leftBracket fill];
+        [rightBracket fill];
+        [labels enumerateObjectsUsingBlock:^(id label, NSUInteger index, BOOL *stop)
+        {
+            CGRect boundingRect = boundingRects[label].CGRectValue;
+            int row = (int)index / [self cols];
+            int col = (int)index % [self cols];
+            int x = borderGap + lipWidth + col * (maxWidth + colGap) + (maxWidth - boundingRect.size.width) / 2;
+            int y = borderGap + lipWidth + row * (maxHeight + rowGap) + (maxHeight - boundingRect.size.height) / 2;
+            CGRect textRect = CGRectMake(x, y, boundingRect.size.width, boundingRect.size.height);
+            [label drawInRect:textRect withAttributes:textFontAttributes];
+        }];
+        UIImage* image = UIGraphicsGetImageFromCurrentImageContext();
+        UIGraphicsEndImageContext();
+        return image;
+    } else if (([self dims] == 2) && ([self type] == CV_8U || [self type] == CV_8UC3 || [self type] == CV_8UC4)) {
+        return [self toUIImage];
+    } else if ([self dims] == 2 && [self channels] == 1) {
+        Mat* normalized = [Mat new];
+        [Core normalize:self dst:normalized alpha:0 beta:255 norm_type:NORM_MINMAX dtype:CV_8U];
+        Mat* normalizedKey = [[Mat alloc] initWithRows:[self rows] + 10 cols:[self cols] type:CV_8U];
+        std::vector<char> key;
+        for (int index = 0; index < [self cols]; index++) {
+            key.push_back((char)(index * 256 / [self cols]));
+        }
+        for (int index = 0; index < 10; index++) {
+            [normalizedKey put:@[[NSNumber numberWithInt:index], [NSNumber numberWithInt:0]] count:[self cols] byteBuffer:key.data()];
+        }
+        [normalized copyTo:[normalizedKey submatRoi:[[Rect2i alloc] initWithX:0 y:10 width:[self cols] height:[self rows]]]];
+        Mat* colorMap = [Mat new];
+        [Imgproc applyColorMap:normalizedKey dst:colorMap colormap:COLORMAP_JET];
+        [Imgproc cvtColor:colorMap dst:colorMap code:COLOR_BGR2RGB];
+        return [colorMap toUIImage];
+    }
+    return [self description];
+}
+
+@end
diff --git a/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.h b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.h
new file mode 100644
index 000000000000..9fa31aba399e
--- /dev/null
+++ b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.h
@@ -0,0 +1,27 @@
+//
+//  Mat+QuickLook.h
+//
+//  Created by Giles Payne on 2021/07/18.
+//
+
+#pragma once
+
+#ifdef __cplusplus
+#import "opencv2/core.hpp"
+#else
+#define CV_EXPORTS
+#endif
+
+#import "Mat.h"
+#import <Foundation/Foundation.h>
+#import <AppKit/AppKit.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+CV_EXPORTS @interface Mat (QuickLook)
+
+- (id)debugQuickLookObject;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm
new file mode 100644
index 000000000000..6775f817806c
--- /dev/null
+++ b/modules/imgcodecs/misc/objc/macosx/Mat+QuickLook.mm
@@ -0,0 +1,154 @@
+//
+//  Mat+QuickLook.mm
+//
+//  Created by Giles Payne on 2021/07/18.
+//
+
+#import "Mat+QuickLook.h"
+#import "Mat+Converters.h"
+#import "Rect2i.h"
+#import "Core.h"
+#import "Imgproc.h"
+#import <opencv2/imgcodecs/macosx.h>
+
+#define SIZE 20
+
+static NSFont* getCMU() {
+    return [NSFont fontWithName:@"CMU Serif" size:SIZE];
+}
+
+static NSFont* getBodoni72() {
+    return [NSFont fontWithName:@"Bodoni 72" size:SIZE];
+}
+
+static NSFont* getAnySerif() {
+    if (@available(macOS 11.0, *)) {
+        return [NSFont fontWithDescriptor:[[NSFontDescriptor preferredFontDescriptorForTextStyle:NSFontTextStyleBody options:@{}] fontDescriptorWithDesign:NSFontDescriptorSystemDesignSerif] size:SIZE];
+    } else {
+        return nil;
+    }
+}
+
+static NSFont* getSystemFont() {
+    return [NSFont systemFontOfSize:SIZE];
+}
+
+typedef NSFont* (*FontGetter)();
+
+@implementation Mat (QuickLook)
+
+- (NSString*)makeLabel:(BOOL)isIntType val:(NSNumber*)num {
+    if (isIntType) {
+        return [NSString stringWithFormat:@"%d", num.intValue];
+    } else {
+        int exponent = 1 + (int)log10(abs(num.doubleValue));
+        if (num.doubleValue == (double)num.intValue && num.doubleValue < 10000 && num.doubleValue > -10000) {
+            return [NSString stringWithFormat:@"%d", num.intValue];;
+        } else if (exponent <= 5 && exponent >= -1) {
+            return [NSString stringWithFormat:[NSString stringWithFormat:@"%%%d.%df", 6, MIN(5 - exponent, 4)], num.doubleValue];
+        } else {
+            return [[[NSString stringWithFormat:@"%.2e", num.doubleValue] stringByReplacingOccurrencesOfString:@"e+0" withString:@"e"] stringByReplacingOccurrencesOfString:@"e-0" withString:@"e-"];
+        }
+    }
+}
+
+- (id)debugQuickLookObject {
+    // for smallish Mat objects display as a matrix
+    if ([self dims] == 2 && [self rows] <= 10 && [self cols] <= 10) {
+        FontGetter fontGetters[] = { getCMU, getBodoni72, getAnySerif, getSystemFont };
+        NSFont* font = nil;
+        for (int fontGetterIndex = 0; font==nil && fontGetterIndex < (sizeof(fontGetters)) / (sizeof(fontGetters[0])); fontGetterIndex++) {
+            font = fontGetters[fontGetterIndex]();
+        }
+        int elements = [self rows] * [self cols];
+        NSDictionary<NSAttributedStringKey,id>* textFontAttributes = @{ NSFontAttributeName: font, NSForegroundColorAttributeName: NSColor.blackColor };
+        NSMutableArray<NSNumber*>* rawData = [NSMutableArray new];
+        for (int dataIndex = 0; dataIndex < elements; dataIndex++) {
+            [rawData addObject:[NSNumber numberWithDouble:0]];
+        }
+        [self get:0 col: 0 data: rawData];
+        BOOL isIntType = [self depth] <= CV_32S;
+        NSMutableArray<NSString*>* labels = [NSMutableArray new];
+        NSMutableDictionary<NSString*, NSValue*>* boundingRects = [NSMutableDictionary dictionaryWithCapacity:elements];
+        int maxWidth = 0, maxHeight = 0;
+        for (NSNumber* number in rawData) {
+            NSString* label = [self makeLabel:isIntType val:number];
+            [labels addObject:label];
+            NSRect boundingRect = [label boundingRectWithSize:NSMakeSize(CGFLOAT_MAX, CGFLOAT_MAX) options:NSStringDrawingUsesLineFragmentOrigin attributes:textFontAttributes];
+            if (boundingRect.size.width > maxWidth) {
+                maxWidth = boundingRect.size.width;
+            }
+            if (boundingRect.size.height > maxHeight) {
+                maxHeight = boundingRect.size.height;
+            }
+            boundingRects[label] = [NSValue valueWithRect:boundingRect];
+        }
+
+        int rowGap = 8;
+        int colGap = 8;
+        int borderGap = 9;
+        int lineThickness = 4;
+        int lipWidth = 8;
+        int imageWidth = 2 * (borderGap + lipWidth) + maxWidth * [self cols] + colGap * ([self cols] - 1);
+        int imageHeight = 2 * (borderGap + lipWidth) + maxHeight * [self rows] + rowGap * ([self rows] - 1);
+        NSImage* image = [[NSImage alloc] initWithSize:NSMakeSize(imageWidth, imageHeight)];
+        NSBezierPath* leftBracket = [NSBezierPath new];
+        [leftBracket moveToPoint:NSMakePoint(borderGap, borderGap)];
+        [leftBracket relativeLineToPoint:NSMakePoint(0, imageHeight - 2 * borderGap)];
+        [leftBracket relativeLineToPoint:NSMakePoint(lineThickness + lipWidth, 0)];
+        [leftBracket relativeLineToPoint:NSMakePoint(0, -lineThickness)];
+        [leftBracket relativeLineToPoint:NSMakePoint(-lipWidth, 0)];
+        [leftBracket relativeLineToPoint:NSMakePoint(0, -(imageHeight - 2 * (borderGap + lineThickness)))];
+        [leftBracket relativeLineToPoint:NSMakePoint(lipWidth, 0)];
+        [leftBracket relativeLineToPoint:NSMakePoint(0, -lineThickness)];
+        [leftBracket relativeLineToPoint:NSMakePoint(-(lineThickness + lipWidth), 0)];
+        NSAffineTransform* reflect = [NSAffineTransform new];
+        [reflect scaleXBy:-1 yBy:1];
+        [reflect translateXBy:-imageWidth yBy:0];
+        NSBezierPath* rightBracket = [leftBracket copy];
+        [rightBracket transformUsingAffineTransform:reflect];
+
+        [image lockFocus];
+        [NSColor.whiteColor drawSwatchInRect:NSMakeRect(0, 0, imageWidth, imageHeight)];
+        [NSColor.blackColor set];
+        [leftBracket fill];
+        [rightBracket fill];
+
+        [labels enumerateObjectsUsingBlock:^(id label, NSUInteger index, BOOL *stop)
+        {
+            NSRect boundingRect = boundingRects[label].rectValue;
+            int row = [self rows] - 1 - ((int)index / [self cols]);
+            int col = (int)index % [self cols];
+            int x = borderGap + lipWidth + col * (maxWidth + colGap) + (maxWidth - boundingRect.size.width) / 2;
+            int y = borderGap + lipWidth + row * (maxHeight + rowGap) + (maxHeight - boundingRect.size.height) / 2;
+            NSRect textRect = NSMakeRect(x, y, boundingRect.size.width, boundingRect.size.height);
+            [label drawInRect:textRect withAttributes:textFontAttributes];
+        }];
+        [image unlockFocus];
+        return image;
+    } else if (([self dims] == 2) && ([self type] == CV_8U || [self type] == CV_8UC3 || [self type] == CV_8UC4)) {
+        // convert to NSImage if the Mats has 2 dimensions and a type and number of channels consistent with it being a image
+        return [self toNSImage];
+    } else if ([self dims] == 2 && [self channels] == 1) {
+        // for other Mats with 2 dimensions and one channel - generate heat map
+        Mat* normalized = [Mat new];
+        [Core normalize:self dst:normalized alpha:0 beta:255 norm_type:NORM_MINMAX dtype:CV_8U];
+        Mat* normalizedKey = [[Mat alloc] initWithRows:[self rows] + 10 cols:[self cols] type:CV_8U];
+        std::vector<char> key;
+        for (int index = 0; index < [self cols]; index++) {
+            key.push_back((char)(index * 256 / [self cols]));
+        }
+        for (int index = 0; index < 10; index++) {
+            [normalizedKey put:@[[NSNumber numberWithInt:index], [NSNumber numberWithInt:0]] count:[self cols] byteBuffer:key.data()];
+        }
+        [normalized copyTo:[normalizedKey submatRoi:[[Rect2i alloc] initWithX:0 y:10 width:[self cols] height:[self rows]]]];
+        Mat* colorMap = [Mat new];
+        [Imgproc applyColorMap:normalizedKey dst:colorMap colormap:COLORMAP_JET];
+        [Imgproc cvtColor:colorMap dst:colorMap code:COLOR_BGR2RGB];
+        return [colorMap toNSImage];
+    }
+    //everything just return the Mat description
+    return [self description];
+}
+
+@end
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index 88c960747c60..19f3b1a9bf2c 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -2303,7 +2303,7 @@ enlarge an image, it will generally look best with c#INTER_CUBIC (slow) or #INTE
 @param src input image.
 @param dst output image; it has the size dsize (when it is non-zero) or the size computed from
 src.size(), fx, and fy; the type of dst is the same as of src.
-@param dsize output image size; if it equals zero, it is computed as:
+@param dsize output image size; if it equals zero (`None` in Python), it is computed as:
  \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
  Either dsize or both fx and fy must be non-zero.
 @param fx scale factor along the horizontal axis; when it equals 0, it is computed as
@@ -3897,6 +3897,7 @@ hierarchy[i][0] , hierarchy[i][1] , hierarchy[i][2] , and hierarchy[i][3] are se
 in contours of the next and previous contours at the same hierarchical level, the first child
 contour and the parent contour, respectively. If for the contour i there are no next, previous,
 parent, or nested contours, the corresponding elements of hierarchy[i] will be negative.
+@note In Python, hierarchy is nested inside a top level array. Use hierarchy[0][i] to access hierarchical elements of i-th contour.
 @param mode Contour retrieval mode, see #RetrievalModes
 @param method Contour approximation method, see #ContourApproximationModes
 @param offset Optional offset by which every contour point is shifted. This is useful if the
diff --git a/modules/imgproc/src/intersection.cpp b/modules/imgproc/src/intersection.cpp
index 3f749896a42c..47d3f3f457b5 100644
--- a/modules/imgproc/src/intersection.cpp
+++ b/modules/imgproc/src/intersection.cpp
@@ -47,24 +47,16 @@
 namespace cv
 {
 
-int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& rect2, OutputArray intersectingRegion )
+static int _rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& rect2, std::vector<Point2f> &intersection )
 {
     CV_INSTRUMENT_REGION();
 
     // L2 metric
     const float samePointEps = std::max(1e-16f, 1e-6f * (float)std::max(rect1.size.area(), rect2.size.area()));
 
-    if (rect1.size.empty() || rect2.size.empty())
-    {
-        intersectingRegion.release();
-        return INTERSECT_NONE;
-    }
-
     Point2f vec1[4], vec2[4];
     Point2f pts1[4], pts2[4];
 
-    std::vector <Point2f> intersection; intersection.reserve(24);
-
     rect1.points(pts1);
     rect2.points(pts2);
 
@@ -92,8 +84,6 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
                 intersection[i] = pts1[i];
             }
 
-            Mat(intersection).copyTo(intersectingRegion);
-
             return INTERSECT_FULL;
         }
     }
@@ -300,7 +290,50 @@ int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& r
     }
 
     intersection.resize(N);
-    Mat(intersection).copyTo(intersectingRegion);
+
+    return ret;
+}
+
+int rotatedRectangleIntersection( const RotatedRect& rect1, const RotatedRect& rect2, OutputArray intersectingRegion )
+{
+    CV_INSTRUMENT_REGION();
+
+    if (rect1.size.empty() || rect2.size.empty())
+    {
+        intersectingRegion.release();
+        return INTERSECT_NONE;
+    }
+
+    // Shift rectangles closer to origin (0, 0) to improve the calculation of the intesection region
+    // To do that, the average center of the rectangles is moved to the origin
+    const Point2f averageCenter = (rect1.center + rect2.center) / 2.0f;
+
+    RotatedRect shiftedRect1(rect1);
+    RotatedRect shiftedRect2(rect2);
+
+    // Move rectangles closer to origin
+    shiftedRect1.center -= averageCenter;
+    shiftedRect2.center -= averageCenter;
+
+    std::vector <Point2f> intersection; intersection.reserve(24);
+
+    const int ret = _rotatedRectangleIntersection(shiftedRect1, shiftedRect2, intersection);
+
+    // If return is not None, the intersection Points are shifted back to the original position
+    // and copied to the interesectingRegion
+    if (ret != INTERSECT_NONE)
+    {
+        for (size_t i = 0; i < intersection.size(); ++i)
+        {
+            intersection[i] += averageCenter;
+        }
+
+        Mat(intersection).copyTo(intersectingRegion);
+    }
+    else
+    {
+        intersectingRegion.release();
+    }
 
     return ret;
 }
diff --git a/modules/imgproc/src/rotcalipers.cpp b/modules/imgproc/src/rotcalipers.cpp
index 527f71a2477f..e3d81c7e0c0f 100644
--- a/modules/imgproc/src/rotcalipers.cpp
+++ b/modules/imgproc/src/rotcalipers.cpp
@@ -88,6 +88,32 @@ enum { CALIPERS_MAXHEIGHT=0, CALIPERS_MINAREARECT=1, CALIPERS_MAXDIST=2 };
  //    Notes:
  //F*/
 
+static void rotate90CCW(const cv::Point2f& in, cv::Point2f &out)
+{
+    out.x = -in.y;
+    out.y = in.x;
+}
+
+static void rotate90CW(const cv::Point2f& in, cv::Point2f &out)
+{
+    out.x = in.y;
+    out.y = -in.x;
+}
+
+static void rotate180(const cv::Point2f& in, cv::Point2f &out)
+{
+    out.x = -in.x;
+    out.y = -in.y;
+}
+
+/* return true if first vector is to the right (clockwise) of the second */
+static bool firstVecIsRight(const cv::Point2f& vec1, const cv::Point2f &vec2)
+{
+    cv::Point2f tmp;
+    rotate90CW(vec1, tmp);
+    return tmp.x * vec2.x + tmp.y * vec2.y < 0;
+}
+
 /* we will use usual cartesian coordinates */
 static void rotatingCalipers( const Point2f* points, int n, int mode, float* out )
 {
@@ -100,6 +126,7 @@ static void rotatingCalipers( const Point2f* points, int n, int mode, float* out
     Point2f* vect = (Point2f*)(inv_vect_length + n);
     int left = 0, bottom = 0, right = 0, top = 0;
     int seq[4] = { -1, -1, -1, -1 };
+    Point2f rot_vect[4];
 
     /* rotating calipers sides will always have coordinates
      (a,b) (-b,a) (-a,-b) (b, -a)
@@ -179,32 +206,18 @@ static void rotatingCalipers( const Point2f* points, int n, int mode, float* out
     /* all of edges will be checked while rotating calipers by 90 degrees */
     for( k = 0; k < n; k++ )
     {
-        /* sinus of minimal angle */
-        /*float sinus;*/
-
-        /* compute cosine of angle between calipers side and polygon edge */
-        /* dp - dot product */
-        float dp[4] = {
-            +base_a * vect[seq[0]].x + base_b * vect[seq[0]].y,
-            -base_b * vect[seq[1]].x + base_a * vect[seq[1]].y,
-            -base_a * vect[seq[2]].x - base_b * vect[seq[2]].y,
-            +base_b * vect[seq[3]].x - base_a * vect[seq[3]].y,
-        };
-
-        float maxcos = dp[0] * inv_vect_length[seq[0]];
-
         /* number of calipers edges, that has minimal angle with edge */
         int main_element = 0;
 
-        /* choose minimal angle */
-        for ( i = 1; i < 4; ++i )
+        /* choose minimum angle between calipers side and polygon edge by dot product sign */
+        rot_vect[0] = vect[seq[0]];
+        rotate90CW(vect[seq[1]], rot_vect[1]);
+        rotate180(vect[seq[2]], rot_vect[2]);
+        rotate90CCW(vect[seq[3]], rot_vect[3]);
+        for (i = 1; i < 4; i++)
         {
-            float cosalpha = dp[i] * inv_vect_length[seq[i]];
-            if (cosalpha > maxcos)
-            {
+            if (firstVecIsRight(rot_vect[i], rot_vect[main_element]))
                 main_element = i;
-                maxcos = cosalpha;
-            }
         }
 
         /*rotate calipers*/
diff --git a/modules/imgproc/test/test_convhull.cpp b/modules/imgproc/test/test_convhull.cpp
index 9c3f060e6059..0f379cd02b0e 100644
--- a/modules/imgproc/test/test_convhull.cpp
+++ b/modules/imgproc/test/test_convhull.cpp
@@ -2384,5 +2384,78 @@ TEST(Imgproc_minAreaRect, reproducer_18157)
     EXPECT_TRUE(checkMinAreaRect(rr, contour)) << rr.center << " " << rr.size << " " << rr.angle;
 }
 
+TEST(Imgproc_minAreaRect, reproducer_19769_lightweight)
+{
+    const int N = 23;
+    float pts_[N][2] = {
+            {1325, 732}, {1248, 808}, {582, 1510}, {586, 1524},
+            {595, 1541}, {599, 1547}, {789, 1745}, {829, 1786},
+            {997, 1958}, {1116, 2074}, {1207, 2066}, {1216, 2058},
+            {1231, 2044}, {1265, 2011}, {2036, 1254}, {2100, 1191},
+            {2169, 1123}, {2315, 979}, {2395, 900}, {2438, 787},
+            {2434, 782}, {2416, 762}, {2266, 610}
+    };
+    Mat contour(N, 1, CV_32FC2, (void*)pts_);
+
+    RotatedRect rr = cv::minAreaRect(contour);
+
+    EXPECT_TRUE(checkMinAreaRect(rr, contour)) << rr.center << " " << rr.size << " " << rr.angle;
+}
+
+TEST(Imgproc_minAreaRect, reproducer_19769)
+{
+    const int N = 169;
+    float pts_[N][2] = {
+            {1854, 227}, {1850, 228}, {1847, 229}, {1835, 235},
+            {1832, 237}, {1829, 239}, {1825, 242}, {1818, 248},
+            {1807, 258}, {1759, 306}, {1712, 351}, {1708, 356},
+            {1658, 404}, {1655, 408}, {1602, 459}, {1599, 463},
+            {1542, 518}, {1477, 582}, {1402, 656}, {1325, 732},
+            {1248, 808}, {1161, 894}, {1157, 898}, {1155, 900},
+            {1068, 986}, {1060, 995}, {1058, 997}, {957, 1097},
+            {956, 1097}, {814, 1238}, {810, 1242}, {805, 1248},
+            {610, 1442}, {603, 1450}, {599, 1455}, {596, 1459},
+            {594, 1462}, {592, 1465}, {590, 1470}, {588, 1472},
+            {586, 1476}, {586, 1478}, {584, 1481}, {583, 1485},
+            {582, 1490}, {582, 1510}, {583, 1515}, {584, 1518},
+            {585, 1521}, {586, 1524}, {593, 1538}, {595, 1541},
+            {597, 1544}, {599, 1547}, {603, 1552}, {609, 1559},
+            {623, 1574}, {645, 1597}, {677, 1630}, {713, 1667},
+            {753, 1707}, {789, 1744}, {789, 1745}, {829, 1786},
+            {871, 1828}, {909, 1867}, {909, 1868}, {950, 1910},
+            {953, 1912}, {997, 1958}, {1047, 2009}, {1094, 2056},
+            {1105, 2066}, {1110, 2070}, {1113, 2072}, {1116, 2074},
+            {1119, 2076}, {1122, 2077}, {1124, 2079}, {1130, 2082},
+            {1133, 2083}, {1136, 2084}, {1139, 2085}, {1142, 2086},
+            {1148, 2087}, {1166, 2087}, {1170, 2086}, {1174, 2085},
+            {1177, 2084}, {1180, 2083}, {1188, 2079}, {1190, 2077},
+            {1193, 2076}, {1196, 2074}, {1199, 2072}, {1202, 2070},
+            {1207, 2066}, {1216, 2058}, {1231, 2044}, {1265, 2011},
+            {1314, 1962}, {1360, 1917}, {1361, 1917}, {1408, 1871},
+            {1457, 1822}, {1508, 1773}, {1512, 1768}, {1560, 1722},
+            {1617, 1665}, {1671, 1613}, {1730, 1554}, {1784, 1502},
+            {1786, 1500}, {1787, 1498}, {1846, 1440}, {1850, 1437},
+            {1908, 1380}, {1974, 1314}, {2034, 1256}, {2036, 1254},
+            {2100, 1191}, {2169, 1123}, {2242, 1051}, {2315, 979},
+            {2395, 900}, {2426, 869}, {2435, 859}, {2438, 855},
+            {2440, 852}, {2442, 849}, {2443, 846}, {2445, 844},
+            {2446, 842}, {2446, 840}, {2448, 837}, {2449, 834},
+            {2450, 829}, {2450, 814}, {2449, 809}, {2448, 806},
+            {2447, 803}, {2442, 793}, {2440, 790}, {2438, 787},
+            {2434, 782}, {2428, 775}, {2416, 762}, {2411, 758},
+            {2342, 688}, {2340, 686}, {2338, 684}, {2266, 610},
+            {2260, 605}, {2170, 513}, {2075, 417}, {2073, 415},
+            {2069, 412}, {1955, 297}, {1955, 296}, {1913, 254},
+            {1904, 246}, {1897, 240}, {1894, 238}, {1891, 236},
+            {1888, 234}, {1880, 230}, {1877, 229}, {1874, 228},
+            {1870, 227}
+    };
+    Mat contour(N, 1, CV_32FC2, (void*)pts_);
+
+    RotatedRect rr = cv::minAreaRect(contour);
+
+    EXPECT_TRUE(checkMinAreaRect(rr, contour)) << rr.center << " " << rr.size << " " << rr.angle;
+}
+
 }} // namespace
 /* End of file. */
diff --git a/modules/imgproc/test/test_intersection.cpp b/modules/imgproc/test/test_intersection.cpp
index 7527dd9a22cc..c455c439fce1 100644
--- a/modules/imgproc/test/test_intersection.cpp
+++ b/modules/imgproc/test/test_intersection.cpp
@@ -391,4 +391,21 @@ TEST(Imgproc_RotatedRectangleIntersection, regression_18520)
     }
 }
 
+TEST(Imgproc_RotatedRectangleIntersection, regression_19824)
+{
+    RotatedRect r1(
+        Point2f(246805.033f, 4002326.94f),
+        Size2f(26.40587f, 6.20026f),
+        -62.10156f);
+    RotatedRect r2(
+        Point2f(246805.122f, 4002326.59f),
+        Size2f(27.4821f, 8.5361f),
+        -56.33761f);
+
+    std::vector<Point2f> intersections;
+    int interType = cv::rotatedRectangleIntersection(r1, r2, intersections);
+    EXPECT_EQ(INTERSECT_PARTIAL, interType);
+    EXPECT_LE(intersections.size(), (size_t)7);
+}
+
 }} // namespace
diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py
index 6019ca340d25..c5b4f34a8f2b 100755
--- a/modules/java/generator/gen_java.py
+++ b/modules/java/generator/gen_java.py
@@ -258,6 +258,8 @@ def __init__(self, decl, namespaces=[]): # [ 'class/struct cname', ': base', [mo
         for m in decl[2]:
             if m.startswith("="):
                 self.jname = m[1:]
+            if m == '/Simple':
+                self.smart = False
 
         if self.classpath:
             prefix = self.classpath.replace('.', '_')
@@ -445,7 +447,7 @@ def __init__(self):
 
     def clear(self):
         self.namespaces = ["cv"]
-        classinfo_Mat = ClassInfo([ 'class cv.Mat', '', [], [] ], self.namespaces)
+        classinfo_Mat = ClassInfo([ 'class cv.Mat', '', ['/Simple'], [] ], self.namespaces)
         self.classes = { "Mat" : classinfo_Mat }
         self.module = ""
         self.Module = ""
@@ -466,10 +468,15 @@ def add_class(self, decl):
         if name in type_dict and not classinfo.base:
             logging.warning('duplicated: %s', classinfo)
             return
+        if self.isSmartClass(classinfo):
+            jni_name = "*((*(Ptr<"+classinfo.fullNameCPP()+">*)%(n)s_nativeObj).get())"
+        else:
+            jni_name = "(*("+classinfo.fullNameCPP()+"*)%(n)s_nativeObj)"
         type_dict.setdefault(name, {}).update(
             { "j_type" : classinfo.jname,
               "jn_type" : "long", "jn_args" : (("__int64", ".nativeObj"),),
-              "jni_name" : "(*("+classinfo.fullNameCPP()+"*)%(n)s_nativeObj)", "jni_type" : "jlong",
+              "jni_name" : jni_name,
+              "jni_type" : "jlong",
               "suffix" : "J",
               "j_import" : "org.opencv.%s.%s" % (self.module, classinfo.jname)
             }
@@ -477,7 +484,8 @@ def add_class(self, decl):
         type_dict.setdefault(name+'*', {}).update(
             { "j_type" : classinfo.jname,
               "jn_type" : "long", "jn_args" : (("__int64", ".nativeObj"),),
-              "jni_name" : "("+classinfo.fullNameCPP()+"*)%(n)s_nativeObj", "jni_type" : "jlong",
+              "jni_name" : "&("+jni_name+")",
+              "jni_type" : "jlong",
               "suffix" : "J",
               "j_import" : "org.opencv.%s.%s" % (self.module, classinfo.jname)
             }
@@ -966,7 +974,13 @@ def gen_func(self, ci, fi, prop_name=''):
                 ret = "return env->NewStringUTF(_retval_.c_str());"
                 default = 'return env->NewStringUTF("");'
             elif self.isWrapped(fi.ctype): # wrapped class:
-                ret = "return (jlong) new %s(_retval_);" % self.fullTypeNameCPP(fi.ctype)
+                ret = None
+                if fi.ctype in self.classes:
+                    ret_ci = self.classes[fi.ctype]
+                    if self.isSmartClass(ret_ci):
+                        ret = "return (jlong)(new Ptr<%(ctype)s>(new %(ctype)s(_retval_)));" % { 'ctype': ret_ci.fullNameCPP() }
+                if ret is None:
+                    ret = "return (jlong) new %s(_retval_);" % self.fullTypeNameCPP(fi.ctype)
             elif fi.ctype.startswith('Ptr_'):
                 c_prologue.append("typedef Ptr<%s> %s;" % (self.fullTypeNameCPP(fi.ctype[4:]), fi.ctype))
                 ret = "return (jlong)(new %(ctype)s(_retval_));" % { 'ctype':fi.ctype }
@@ -1207,17 +1221,7 @@ def isSmartClass(self, ci):
         if ci.smart != None:
             return ci.smart
 
-        # if parents are smart (we hope) then children are!
-        # if not we believe the class is smart if it has "create" method
-        ci.smart = False
-        if ci.base or ci.name == 'Algorithm':
-            ci.smart = True
-        else:
-            for fi in ci.methods:
-                if fi.name == "create":
-                    ci.smart = True
-                    break
-
+        ci.smart = True  # smart class is not properly handled in case of base/derived classes
         return ci.smart
 
     def smartWrap(self, ci, fullname):
diff --git a/modules/js/src/make_umd.py b/modules/js/src/make_umd.py
index bed6ee9bcc0f..1096a8eb31b0 100644
--- a/modules/js/src/make_umd.py
+++ b/modules/js/src/make_umd.py
@@ -95,7 +95,7 @@ def make_umd(opencvjs, cvjs):
     root.cv = factory();
   } else if (typeof importScripts === 'function') {
     // Web worker
-    root.cv = factory;
+    root.cv = factory();
   } else {
     // Other shells, e.g. d8
     root.cv = factory();
diff --git a/modules/ml/misc/java/test/MLTest.java b/modules/ml/misc/java/test/MLTest.java
index 2b08543a843b..504805dffa97 100644
--- a/modules/ml/misc/java/test/MLTest.java
+++ b/modules/ml/misc/java/test/MLTest.java
@@ -36,7 +36,7 @@ public void testSaveLoad() {
         String filename = OpenCVTestRunner.getTempFileName("yml");
         saved.save(filename);
         SVM loaded = SVM.load(filename);
-        assertTrue(saved.isTrained());
+        assertTrue(loaded.isTrained());
     }
 
 }
diff --git a/modules/python/common.cmake b/modules/python/common.cmake
index 0ea4ee7b27a9..cedf07143488 100644
--- a/modules/python/common.cmake
+++ b/modules/python/common.cmake
@@ -86,7 +86,7 @@ set_target_properties(${the_module} PROPERTIES
                       ARCHIVE_OUTPUT_NAME ${the_module}  # prevent name conflict for python2/3 outputs
                       PREFIX ""
                       OUTPUT_NAME cv2
-                      SUFFIX ${CVPY_SUFFIX})
+                      SUFFIX "${CVPY_SUFFIX}")
 
 if(ENABLE_SOLUTION_FOLDERS)
   set_target_properties(${the_module} PROPERTIES FOLDER "bindings")
diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp
index 9e8a6ee13bd9..55f512291b50 100644
--- a/modules/python/src2/cv2.cpp
+++ b/modules/python/src2/cv2.cpp
@@ -2081,15 +2081,23 @@ static void OnChange(int pos, void *param)
 }
 
 #ifdef HAVE_OPENCV_HIGHGUI
+// workaround for #20408, use nullptr, set value later
+static int _createTrackbar(const String &trackbar_name, const String &window_name, int value, int count,
+                    TrackbarCallback onChange, PyObject* py_callback_info)
+{
+    int n = createTrackbar(trackbar_name, window_name, NULL, count, onChange, py_callback_info);
+    setTrackbarPos(trackbar_name, window_name, value);
+    return n;
+}
 static PyObject *pycvCreateTrackbar(PyObject*, PyObject *args)
 {
     PyObject *on_change;
     char* trackbar_name;
     char* window_name;
-    int *value = new int;
+    int value;
     int count;
 
-    if (!PyArg_ParseTuple(args, "ssiiO", &trackbar_name, &window_name, value, &count, &on_change))
+    if (!PyArg_ParseTuple(args, "ssiiO", &trackbar_name, &window_name, &value, &count, &on_change))
         return NULL;
     if (!PyCallable_Check(on_change)) {
         PyErr_SetString(PyExc_TypeError, "on_change must be callable");
@@ -2108,7 +2116,7 @@ static PyObject *pycvCreateTrackbar(PyObject*, PyObject *args)
     {
         registered_callbacks.insert(std::pair<std::string, PyObject*>(name, py_callback_info));
     }
-    ERRWRAP2(createTrackbar(trackbar_name, window_name, value, count, OnChange, py_callback_info));
+    ERRWRAP2(_createTrackbar(trackbar_name, window_name, value, count, OnChange, py_callback_info));
     Py_RETURN_NONE;
 }
 
@@ -2219,12 +2227,6 @@ static PyMethodDef special_methods[] = {
 #ifdef HAVE_OPENCV_DNN
   {"dnn_registerLayer", CV_PY_FN_WITH_KW(pyopencv_cv_dnn_registerLayer), "registerLayer(type, class) -> None"},
   {"dnn_unregisterLayer", CV_PY_FN_WITH_KW(pyopencv_cv_dnn_unregisterLayer), "unregisterLayer(type) -> None"},
-#endif
-#ifdef HAVE_OPENCV_GAPI
-  {"GIn", CV_PY_FN_WITH_KW(pyopencv_cv_GIn), "GIn(...) -> GInputProtoArgs"},
-  {"GOut", CV_PY_FN_WITH_KW(pyopencv_cv_GOut), "GOut(...) -> GOutputProtoArgs"},
-  {"gin", CV_PY_FN_WITH_KW(pyopencv_cv_gin), "gin(...) -> ExtractArgsCallback"},
-  {"descr_of", CV_PY_FN_WITH_KW(pyopencv_cv_descr_of), "descr_of(...) -> ExtractMetaCallback"},
 #endif
   {NULL, NULL},
 };
diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py
index bb2acb267466..fbdf5677c4b7 100755
--- a/modules/python/src2/gen2.py
+++ b/modules/python/src2/gen2.py
@@ -214,6 +214,16 @@ class FormatStrings:
     "Stream": ArgTypeInfo("Stream", FormatStrings.object, 'Stream::Null()', True),
 }
 
+# Set of reserved keywords for Python. Can be acquired via the following call
+# $ python -c "help('keywords')"
+# Keywords that are reserved in C/C++ are excluded because they can not be
+# used as variables identifiers
+python_reserved_keywords = {
+    "True", "None", "False", "as", "assert", "def", "del", "elif", "except", "exec",
+    "finally", "from", "global",  "import", "in", "is", "lambda", "nonlocal",
+    "pass", "print", "raise", "with", "yield"
+}
+
 
 def normalize_class_name(name):
     return re.sub(r"^cv\.", "", name).replace(".", "_")
@@ -371,6 +381,8 @@ class ArgInfo(object):
     def __init__(self, arg_tuple):
         self.tp = handle_ptr(arg_tuple[0])
         self.name = arg_tuple[1]
+        if self.name in python_reserved_keywords:
+            self.name += "_"
         self.defval = arg_tuple[2]
         self.isarray = False
         self.arraylen = 0
diff --git a/modules/python/src2/hdr_parser.py b/modules/python/src2/hdr_parser.py
index 412d41a4df3f..3a09074ee768 100755
--- a/modules/python/src2/hdr_parser.py
+++ b/modules/python/src2/hdr_parser.py
@@ -832,6 +832,7 @@ def parse(self, hname, wmode=True):
                     ("GAPI_EXPORTS_W_SIMPLE","CV_EXPORTS_W_SIMPLE"),
                     ("GAPI_WRAP", "CV_WRAP"),
                     ("GAPI_PROP", "CV_PROP"),
+                    ("GAPI_PROP_RW", "CV_PROP_RW"),
                     ('defined(GAPI_STANDALONE)', '0'),
                 ])
 
@@ -978,7 +979,8 @@ def parse(self, hname, wmode=True):
                                 has_mat = len(list(filter(lambda x: x[0] in {"Mat", "vector_Mat"}, args))) > 0
                                 if has_mat:
                                     _, _, _, gpumat_decl = self.parse_stmt(stmt, token, mat="cuda::GpuMat", docstring=docstring)
-                                    decls.append(gpumat_decl)
+                                    if gpumat_decl != decl:
+                                        decls.append(gpumat_decl)
 
                             if self._generate_umat_decls:
                                 # If function takes as one of arguments Mat or vector<Mat> - we want to create the
@@ -987,7 +989,8 @@ def parse(self, hname, wmode=True):
                                 has_mat = len(list(filter(lambda x: x[0] in {"Mat", "vector_Mat"}, args))) > 0
                                 if has_mat:
                                     _, _, _, umat_decl = self.parse_stmt(stmt, token, mat="UMat", docstring=docstring)
-                                    decls.append(umat_decl)
+                                    if umat_decl != decl:
+                                        decls.append(umat_decl)
 
                         docstring = ""
                     if stmt_type == "namespace":
diff --git a/modules/python/test/test_filestorage_io.py b/modules/python/test/test_filestorage_io.py
index 62b540d79cd8..01e0a72300cc 100755
--- a/modules/python/test/test_filestorage_io.py
+++ b/modules/python/test/test_filestorage_io.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 """Algorithm serialization test."""
 from __future__ import print_function
+import base64
+import json
 import tempfile
 import os
 import cv2 as cv
@@ -109,5 +111,96 @@ def test_yml(self):
     def test_json(self):
         self.run_fs_test(".json")
 
+    def test_base64(self):
+        fd, fname = tempfile.mkstemp(prefix="opencv_python_sample_filestorage_base64", suffix=".json")
+        os.close(fd)
+        np.random.seed(42)
+        self.write_base64_json(fname)
+        os.remove(fname)
+
+    @staticmethod
+    def get_normal_2d_mat():
+        rows = 10
+        cols = 20
+        cn = 3
+
+        image = np.zeros((rows, cols, cn), np.uint8)
+        image[:] = (1, 2, 127)
+
+        for i in range(rows):
+            for j in range(cols):
+                image[i, j, 1] = (i + j) % 256
+
+        return image
+
+    @staticmethod
+    def get_normal_nd_mat():
+        shape = (2, 2, 1, 2)
+        cn = 4
+
+        image = np.zeros(shape + (cn,), np.float64)
+        image[:] = (0.888, 0.111, 0.666, 0.444)
+
+        return image
+
+    @staticmethod
+    def get_empty_2d_mat():
+        shape = (0, 0)
+        cn = 1
+
+        image = np.zeros(shape + (cn,), np.uint8)
+
+        return image
+
+    @staticmethod
+    def get_random_mat():
+        rows = 8
+        cols = 16
+        cn = 1
+
+        image = np.random.rand(rows, cols, cn)
+
+        return image
+
+    @staticmethod
+    def decode(data):
+        # strip $base64$
+        encoded = data[8:]
+
+        if len(encoded) == 0:
+            return b''
+
+        # strip info about datatype and padding
+        return base64.b64decode(encoded)[24:]
+
+    def write_base64_json(self, fname):
+        fs = cv.FileStorage(fname, cv.FileStorage_WRITE_BASE64)
+
+        mats = {'normal_2d_mat': self.get_normal_2d_mat(),
+                'normal_nd_mat': self.get_normal_nd_mat(),
+                'empty_2d_mat': self.get_empty_2d_mat(),
+                'random_mat': self.get_random_mat()}
+
+        for name, mat in mats.items():
+            fs.write(name, mat)
+
+        fs.release()
+
+        data = {}
+        with open(fname) as file:
+            data = json.load(file)
+
+        for name, mat in mats.items():
+            buffer = b''
+
+            if mat.size != 0:
+                if hasattr(mat, 'tobytes'):
+                    buffer = mat.tobytes()
+                else:
+                    buffer = mat.tostring()
+
+            self.assertEqual(buffer, self.decode(data[name]['data']))
+
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/python/test/test_misc.py b/modules/python/test/test_misc.py
index 4c98b928338e..d3a9f357661e 100644
--- a/modules/python/test/test_misc.py
+++ b/modules/python/test/test_misc.py
@@ -464,6 +464,23 @@ def test_parse_to_range_not_convertible(self):
             with self.assertRaises((TypeError), msg=get_no_exception_msg(not_convertible)):
                 _ = cv.utils.dumpRange(not_convertible)
 
+    def test_reserved_keywords_are_transformed(self):
+        default_lambda_value = 2
+        default_from_value = 3
+        format_str = "arg={}, lambda={}, from={}"
+        self.assertEqual(
+            cv.utils.testReservedKeywordConversion(20), format_str.format(20, default_lambda_value, default_from_value)
+        )
+        self.assertEqual(
+            cv.utils.testReservedKeywordConversion(10, lambda_=10), format_str.format(10, 10, default_from_value)
+        )
+        self.assertEqual(
+            cv.utils.testReservedKeywordConversion(10, from_=10), format_str.format(10, default_lambda_value, 10)
+        )
+        self.assertEqual(
+            cv.utils.testReservedKeywordConversion(20, lambda_=-4, from_=12), format_str.format(20, -4, 12)
+        )
+
 
 class SamplesFindFile(NewOpenCVTests):
 
diff --git a/modules/ts/CMakeLists.txt b/modules/ts/CMakeLists.txt
index f95bed079383..c1d249ea149a 100644
--- a/modules/ts/CMakeLists.txt
+++ b/modules/ts/CMakeLists.txt
@@ -41,3 +41,9 @@ endif()
 if(NOT OPENCV_TESTS_CONFIG_STR STREQUAL "${__content}")
   file(WRITE "${OPENCV_TESTS_CONFIG_FILE}" "${OPENCV_TESTS_CONFIG_STR}")
 endif()
+
+if(OPENCV_DISABLE_THREAD_SUPPORT)
+  # This is required to disable threads in the ts module, as
+  # described in `ts_gtest.h`.
+  ocv_target_compile_definitions(${the_module} PUBLIC GTEST_HAS_PTHREAD=0)
+endif()
diff --git a/modules/ts/src/ocl_perf.cpp b/modules/ts/src/ocl_perf.cpp
index 8dacf219f64b..fe521f2c00d9 100644
--- a/modules/ts/src/ocl_perf.cpp
+++ b/modules/ts/src/ocl_perf.cpp
@@ -70,7 +70,7 @@ void randu(InputOutputArray dst)
         cv::randu(dst, -128, 128);
     else if (dst.depth() == CV_16U)
         cv::randu(dst, 0, 1024);
-    else if (dst.depth() == CV_32F || dst.depth() == CV_64F)
+    else if (dst.depth() == CV_32F || dst.depth() == CV_64F || dst.depth() == CV_16F)
         cv::randu(dst, -1.0, 1.0);
     else if (dst.depth() == CV_16S || dst.depth() == CV_32S)
         cv::randu(dst, -4096, 4096);
diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp
index 3aa403ad87e8..3af3a7b8d5af 100644
--- a/modules/ts/src/ts.cpp
+++ b/modules/ts/src/ts.cpp
@@ -72,7 +72,9 @@
 #if defined _WIN32 || defined WINCE
 # include <windows.h>
 #else
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 # include <dirent.h>
+#endif
 # include <sys/stat.h>
 #endif
 
diff --git a/modules/ts/src/ts_gtest.cpp b/modules/ts/src/ts_gtest.cpp
index a65ef721a2c6..b3debd54d2ed 100644
--- a/modules/ts/src/ts_gtest.cpp
+++ b/modules/ts/src/ts_gtest.cpp
@@ -1067,6 +1067,7 @@ class GTEST_API_ UnitTestImpl {
   void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
                    Test::TearDownTestCaseFunc tear_down_tc,
                    TestInfo* test_info) {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -1079,6 +1080,7 @@ class GTEST_API_ UnitTestImpl {
       GTEST_CHECK_(!original_working_dir_.IsEmpty())
           << "Failed to get the current working directory.";
     }
+#endif
 
     GetTestCase(test_info->test_case_name(),
                 test_info->type_param(),
@@ -9165,6 +9167,7 @@ static bool IsPathSeparator(char c) {
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
   // Windows CE doesn't have a current directory, so we just return
   // something reasonable.
@@ -9183,6 +9186,9 @@ FilePath FilePath::GetCurrentDir() {
 # endif  // GTEST_OS_NACL
   return FilePath(result == NULL ? "" : cwd);
 #endif  // GTEST_OS_WINDOWS_MOBILE
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+  return FilePath("");
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 // Returns a copy of the FilePath with the case-insensitive extension removed.
@@ -9391,6 +9397,7 @@ bool FilePath::CreateDirectoriesRecursively() const {
 // directory for any reason, including if the parent directory does not
 // exist. Not named "CreateDirectory" because that's a macro on Windows.
 bool FilePath::CreateFolder() const {
+#if OPENCV_HAVE_FILESYSTEM_SUPPORT
 #if GTEST_OS_WINDOWS_MOBILE
   FilePath removed_sep(this->RemoveTrailingPathSeparator());
   LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
@@ -9406,6 +9413,9 @@ bool FilePath::CreateFolder() const {
     return this->DirectoryExists();  // An error is OK if the directory exists.
   }
   return true;  // No error.
+#else // OPENCV_HAVE_FILESYSTEM_SUPPORT
+  return false;
+#endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
 }
 
 // If input name has a trailing separator character, remove it and return the
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index 2a9169fd13a5..5a42ca01cdc4 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -1297,7 +1297,7 @@ void TestBase::warmup(cv::InputOutputArray a, WarmUpType wtype)
                 cv::randu(a, -128, 128);
             else if (depth == CV_16U)
                 cv::randu(a, 0, 1024);
-            else if (depth == CV_32F || depth == CV_64F)
+            else if (depth == CV_32F || depth == CV_64F || depth == CV_16F)
                 cv::randu(a, -1.0, 1.0);
             else if (depth == CV_16S || depth == CV_32S)
                 cv::randu(a, -4096, 4096);
diff --git a/modules/video/misc/java/test/TrackerCreateTest.java b/modules/video/misc/java/test/TrackerCreateTest.java
index dad696bebfa2..83bbd0b5d5ce 100644
--- a/modules/video/misc/java/test/TrackerCreateTest.java
+++ b/modules/video/misc/java/test/TrackerCreateTest.java
@@ -1,7 +1,10 @@
 package org.opencv.test.video;
 
 import org.opencv.core.Core;
+import org.opencv.core.CvType;
 import org.opencv.core.CvException;
+import org.opencv.core.Mat;
+import org.opencv.core.Rect;
 import org.opencv.test.OpenCVTestCase;
 
 import org.opencv.video.Tracker;
@@ -27,6 +30,10 @@ public void testCreateTrackerGOTURN() {
 
     public void testCreateTrackerMIL() {
         Tracker tracker = TrackerMIL.create();
+        assert(tracker != null);
+        Mat mat = new Mat(100, 100, CvType.CV_8UC1);
+        Rect rect = new Rect(10, 10, 30, 30);
+        tracker.init(mat, rect);  // should not crash (https://github.com/opencv/opencv/issues/19915)
     }
 
 }
diff --git a/modules/videoio/cmake/detect_android_camera.cmake b/modules/videoio/cmake/detect_android_camera.cmake
index ded4c91ccf17..a465751334fd 100644
--- a/modules/videoio/cmake/detect_android_camera.cmake
+++ b/modules/videoio/cmake/detect_android_camera.cmake
@@ -4,5 +4,3 @@ if(ANDROID AND ANDROID_NATIVE_API_LEVEL GREATER 23)
   set(libs "-landroid -llog -lcamera2ndk")
   ocv_add_external_target(android_native_camera "" "${libs}" "HAVE_ANDROID_NATIVE_CAMERA")
 endif()
-
-set(HAVE_ANDROID_NATIVE_CAMERA ${HAVE_ANDROID_NATIVE_CAMERA} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_android_mediandk.cmake b/modules/videoio/cmake/detect_android_mediandk.cmake
index edfb4bbbc5c3..cee64ab54991 100644
--- a/modules/videoio/cmake/detect_android_mediandk.cmake
+++ b/modules/videoio/cmake/detect_android_mediandk.cmake
@@ -4,5 +4,3 @@ if(ANDROID AND ANDROID_NATIVE_API_LEVEL GREATER 20)
   set(libs "-landroid -llog -lmediandk")
   ocv_add_external_target(android_mediandk "" "${libs}" "HAVE_ANDROID_MEDIANDK")
 endif()
-
-set(HAVE_ANDROID_MEDIANDK ${HAVE_ANDROID_MEDIANDK} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_aravis.cmake b/modules/videoio/cmake/detect_aravis.cmake
index 79d6a217db2e..e7b382899343 100644
--- a/modules/videoio/cmake/detect_aravis.cmake
+++ b/modules/videoio/cmake/detect_aravis.cmake
@@ -21,7 +21,7 @@ if(NOT HAVE_ARAVIS_API)
     string(REGEX REPLACE ".*ARAVIS_MAJOR_VERSION[^0-9]+([0-9]+).*" "\\1" ver_major "${ver_strings}")
     string(REGEX REPLACE ".*ARAVIS_MINOR_VERSION[^0-9]+([0-9]+).*" "\\1" ver_minor "${ver_strings}")
     string(REGEX REPLACE ".*ARAVIS_MICRO_VERSION[^0-9]+([0-9]+).*" "\\1" ver_micro "${ver_strings}")
-    set(ARAVIS_VERSION "${ver_major}.${ver_minor}.${ver_micro}" PARENT_SCOPE) # informational
+    set(ARAVIS_VERSION "${ver_major}.${ver_minor}.${ver_micro}")  # informational
     set(ARAVIS_INCLUDE_DIRS "${ARAVIS_INCLUDE}")
     set(ARAVIS_LIBRARIES "${ARAVIS_LIBRARY}")
   endif()
@@ -30,5 +30,3 @@ endif()
 if(HAVE_ARAVIS_API)
   ocv_add_external_target(aravis "${ARAVIS_INCLUDE_DIRS}" "${ARAVIS_LIBRARIES}" "HAVE_ARAVIS_API")
 endif()
-
-set(HAVE_ARAVIS_API ${HAVE_ARAVIS_API} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_avfoundation.cmake b/modules/videoio/cmake/detect_avfoundation.cmake
index a341f587a199..2da4fabfab44 100644
--- a/modules/videoio/cmake/detect_avfoundation.cmake
+++ b/modules/videoio/cmake/detect_avfoundation.cmake
@@ -14,5 +14,3 @@ if(APPLE)
   endif()
   ocv_add_external_target(avfoundation "" "${libs}" "HAVE_AVFOUNDATION")
 endif()
-
-set(HAVE_AVFOUNDATION ${HAVE_AVFOUNDATION} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_dc1394.cmake b/modules/videoio/cmake/detect_dc1394.cmake
index 51ab2dd80eb4..8bcee4bf7098 100644
--- a/modules/videoio/cmake/detect_dc1394.cmake
+++ b/modules/videoio/cmake/detect_dc1394.cmake
@@ -2,7 +2,6 @@
 if(NOT HAVE_DC1394_2 AND PKG_CONFIG_FOUND)
   ocv_check_modules(DC1394_2 libdc1394-2)
   if(DC1394_2_FOUND)
-    set(DC1394_2_VERSION "${DC1394_2_VERSION}" PARENT_SCOPE) # informational
     set(HAVE_DC1394_2 TRUE)
   endif()
 endif()
@@ -20,12 +19,10 @@ if(NOT HAVE_DC1394_2)
     set(HAVE_DC1394_2 TRUE)
     set(DC1394_2_INCLUDE_DIRS "${DC1394_INCLUDE}")
     set(DC1394_2_LIBRARIES "${DC1394_LIBRARY}")
-    set(DC1394_2_VERSION "unknown" PARENT_SCOPE) # informational
+    set(DC1394_2_VERSION "unknown") # informational
   endif()
 endif()
 
 if(HAVE_DC1394_2)
   ocv_add_external_target(dc1394_2 "${DC1394_2_INCLUDE_DIRS}" "${DC1394_2_LIBRARIES}" "HAVE_DC1394_2")
 endif()
-
-set(HAVE_DC1394_2 ${HAVE_DC1394_2} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_dshow.cmake b/modules/videoio/cmake/detect_dshow.cmake
index 3f41b3fd34e2..928134c08c54 100644
--- a/modules/videoio/cmake/detect_dshow.cmake
+++ b/modules/videoio/cmake/detect_dshow.cmake
@@ -10,5 +10,3 @@ endif()
 if(HAVE_DSHOW)
   ocv_add_external_target(dshow "" "" "HAVE_DSHOW")
 endif()
-
-set(HAVE_DSHOW ${HAVE_DSHOW} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_ffmpeg.cmake b/modules/videoio/cmake/detect_ffmpeg.cmake
index 58de4b9515ac..c33eaf221b8a 100644
--- a/modules/videoio/cmake/detect_ffmpeg.cmake
+++ b/modules/videoio/cmake/detect_ffmpeg.cmake
@@ -14,11 +14,6 @@ if(NOT HAVE_FFMPEG AND WIN32 AND NOT ARM AND NOT OPENCV_FFMPEG_SKIP_DOWNLOAD)
   download_win_ffmpeg(FFMPEG_CMAKE_SCRIPT)
   if(FFMPEG_CMAKE_SCRIPT)
     include("${FFMPEG_CMAKE_SCRIPT}")
-    set(FFMPEG_libavcodec_VERSION ${FFMPEG_libavcodec_VERSION} PARENT_SCOPE) # info
-    set(FFMPEG_libavformat_VERSION ${FFMPEG_libavformat_VERSION} PARENT_SCOPE) # info
-    set(FFMPEG_libavutil_VERSION ${FFMPEG_libavutil_VERSION} PARENT_SCOPE) # info
-    set(FFMPEG_libswscale_VERSION ${FFMPEG_libswscale_VERSION} PARENT_SCOPE) # info
-    set(FFMPEG_libavresample_VERSION ${FFMPEG_libavresample_VERSION} PARENT_SCOPE) # info
     set(HAVE_FFMPEG TRUE)
     set(HAVE_FFMPEG_WRAPPER TRUE)
   endif()
@@ -132,5 +127,3 @@ elseif(HAVE_FFMPEG)
     ocv_add_external_target(ffmpeg.plugin_deps "${__plugin_include_dirs}" "${__plugin_include_libs}" "${__plugin_defines}")
   endif()
 endif()
-
-set(HAVE_FFMPEG ${HAVE_FFMPEG} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_gphoto.cmake b/modules/videoio/cmake/detect_gphoto.cmake
index 0d6f1212eb37..2cb23c00335f 100644
--- a/modules/videoio/cmake/detect_gphoto.cmake
+++ b/modules/videoio/cmake/detect_gphoto.cmake
@@ -9,5 +9,3 @@ endif()
 if(HAVE_GPHOTO2)
   ocv_add_external_target(gphoto2 "${GPHOTO2_INCLUDE_DIRS}" "${GPHOTO2_LIBRARIES}" "HAVE_GPHOTO2")
 endif()
-
-set(HAVE_GPHOTO2 ${HAVE_GPHOTO2} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_gstreamer.cmake b/modules/videoio/cmake/detect_gstreamer.cmake
index 219878616175..47ea7a0b3071 100644
--- a/modules/videoio/cmake/detect_gstreamer.cmake
+++ b/modules/videoio/cmake/detect_gstreamer.cmake
@@ -69,7 +69,7 @@ if(NOT HAVE_GSTREAMER AND WIN32)
     string(REGEX REPLACE ".*GST_VERSION_MAJOR[^0-9]+([0-9]+).*" "\\1" ver_major "${ver_strings}")
     string(REGEX REPLACE ".*GST_VERSION_MINOR[^0-9]+([0-9]+).*" "\\1" ver_minor "${ver_strings}")
     string(REGEX REPLACE ".*GST_VERSION_MICRO[^0-9]+([0-9]+).*" "\\1" ver_micro "${ver_strings}")
-    set(GSTREAMER_VERSION "${ver_major}.${ver_minor}.${ver_micro}" PARENT_SCOPE) # informational
+    set(GSTREAMER_VERSION "${ver_major}.${ver_minor}.${ver_micro}")  # informational
     set(HAVE_GSTREAMER TRUE)
     set(GSTREAMER_LIBRARIES
       ${GSTREAMER_gstreamer_LIBRARY}
@@ -95,7 +95,7 @@ if(NOT HAVE_GSTREAMER AND PKG_CONFIG_FOUND)
   ocv_check_modules(GSTREAMER_video gstreamer-video-1.0)
   if(GSTREAMER_base_FOUND AND GSTREAMER_app_FOUND AND GSTREAMER_riff_FOUND AND GSTREAMER_pbutils_FOUND AND GSTREAMER_video_FOUND)
     set(HAVE_GSTREAMER TRUE)
-    set(GSTREAMER_VERSION ${GSTREAMER_base_VERSION} PARENT_SCOPE) # informational
+    set(GSTREAMER_VERSION ${GSTREAMER_base_VERSION})  # informational
     set(GSTREAMER_LIBRARIES ${GSTREAMER_base_LIBRARIES} ${GSTREAMER_app_LIBRARIES} ${GSTREAMER_riff_LIBRARIES} ${GSTREAMER_pbutils_LIBRARIES} ${GSTREAMER_video_LIBRARIES})
     set(GSTREAMER_INCLUDE_DIRS ${GSTREAMER_base_INCLUDE_DIRS} ${GSTREAMER_app_INCLUDE_DIRS} ${GSTREAMER_riff_INCLUDE_DIRS} ${GSTREAMER_pbutils_INCLUDE_DIRS} ${GSTREAMER_video_INCLUDE_DIRS})
   endif()
@@ -104,5 +104,3 @@ endif()
 if(HAVE_GSTREAMER)
   ocv_add_external_target(gstreamer "${GSTREAMER_INCLUDE_DIRS}" "${GSTREAMER_LIBRARIES}" "HAVE_GSTREAMER")
 endif()
-
-set(HAVE_GSTREAMER ${HAVE_GSTREAMER} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_ios.cmake b/modules/videoio/cmake/detect_ios.cmake
index c75426060b0b..8d48dd6f3bea 100644
--- a/modules/videoio/cmake/detect_ios.cmake
+++ b/modules/videoio/cmake/detect_ios.cmake
@@ -11,5 +11,3 @@ if(APPLE AND IOS)
     "-framework UIKit")
   ocv_add_external_target(cap_ios "" "${libs}" "HAVE_CAP_IOS")
 endif()
-
-set(HAVE_CAP_IOS ${HAVE_CAP_IOS} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_msdk.cmake b/modules/videoio/cmake/detect_msdk.cmake
index d035c3f5cc11..83701425e1f8 100644
--- a/modules/videoio/cmake/detect_msdk.cmake
+++ b/modules/videoio/cmake/detect_msdk.cmake
@@ -70,5 +70,3 @@ if(HAVE_MFX)
   list(APPEND MFX_DEFS "HAVE_MFX")
   ocv_add_external_target(mediasdk "${MFX_INCLUDE_DIRS}" "${MFX_LIBRARIES}" "${MFX_DEFS}")
 endif()
-
-set(HAVE_MFX ${HAVE_MFX} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_msmf.cmake b/modules/videoio/cmake/detect_msmf.cmake
index a1c91dab670a..aebc226bcfc9 100644
--- a/modules/videoio/cmake/detect_msmf.cmake
+++ b/modules/videoio/cmake/detect_msmf.cmake
@@ -20,6 +20,3 @@ if(HAVE_MSMF)
   endif()
   ocv_add_external_target(msmf "" "" "${defs}")
 endif()
-
-set(HAVE_MSMF ${HAVE_MSMF} PARENT_SCOPE)
-set(HAVE_MSMF_DXVA ${HAVE_MSMF_DXVA} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_openni2.cmake b/modules/videoio/cmake/detect_openni2.cmake
index 76c31454da81..54a5c62beddc 100644
--- a/modules/videoio/cmake/detect_openni2.cmake
+++ b/modules/videoio/cmake/detect_openni2.cmake
@@ -42,8 +42,6 @@ if(HAVE_OPENNI2)
   string(REGEX REPLACE ".*ONI_VERSION_MAJOR[^0-9]+([0-9]+).*" "\\1" ver_major "${ver_strings}")
   string(REGEX REPLACE ".*ONI_VERSION_MINOR[^0-9]+([0-9]+).*" "\\1" ver_minor "${ver_strings}")
   string(REGEX REPLACE ".*ONI_VERSION_MAINTENANCE[^0-9]+([0-9]+).*" "\\1" ver_maint "${ver_strings}")
-  set(OPENNI2_VERSION "${ver_major}.${ver_minor}.${ver_maint}" PARENT_SCOPE) # informational
+  set(OPENNI2_VERSION "${ver_major}.${ver_minor}.${ver_maint}")  # informational
   ocv_add_external_target(openni2 "${OPENNI2_INCLUDE_DIRS}" "${OPENNI2_LIBRARIES}" "HAVE_OPENNI2")
 endif()
-
-set(HAVE_OPENNI2 ${HAVE_OPENNI2} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_pvapi.cmake b/modules/videoio/cmake/detect_pvapi.cmake
index a0f4673fdc1d..f2c6d4bceaa5 100644
--- a/modules/videoio/cmake/detect_pvapi.cmake
+++ b/modules/videoio/cmake/detect_pvapi.cmake
@@ -19,5 +19,3 @@ endif()
 if(HAVE_PVAPI)
   ocv_add_external_target(pvapi "${PVAPI_INCLUDE}" "${PVAPI_LIBRARY}" "HAVE_PVAPI")
 endif()
-
-set(HAVE_PVAPI ${HAVE_PVAPI} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_realsense.cmake b/modules/videoio/cmake/detect_realsense.cmake
index 32e5e02c9e7b..065f5488301f 100644
--- a/modules/videoio/cmake/detect_realsense.cmake
+++ b/modules/videoio/cmake/detect_realsense.cmake
@@ -4,7 +4,7 @@ if(NOT HAVE_LIBREALSENSE)
   find_package(realsense2 QUIET)
   if(realsense2_FOUND)
     set(HAVE_LIBREALSENSE TRUE)
-    set(LIBREALSENSE_VERSION "${realsense2_VERSION}" PARENT_SCOPE) # informational
+    set(LIBREALSENSE_VERSION "${realsense2_VERSION}")  # informational
     ocv_add_external_target(librealsense "" "${realsense2_LIBRARY}" "HAVE_LIBREALSENSE")
   endif()
 endif()
@@ -20,7 +20,7 @@ if(NOT HAVE_LIBREALSENSE)
     string(REGEX REPLACE ".*RS2_API_MAJOR_VERSION[^0-9]+([0-9]+).*" "\\1" ver_major "${ver_strings}")
     string(REGEX REPLACE ".*RS2_API_MINOR_VERSION[^0-9]+([0-9]+).*" "\\1" ver_minor "${ver_strings}")
     string(REGEX REPLACE ".*RS2_API_PATCH_VERSION[^0-9]+([0-9]+).*" "\\1" ver_patch "${ver_strings}")
-    set(LIBREALSENSE_VERSION "${ver_major}.${ver_minor}.${ver_patch}" PARENT_SCOPE) # informational
+    set(LIBREALSENSE_VERSION "${ver_major}.${ver_minor}.${ver_patch}")  # informational
     ocv_add_external_target(librealsense "${LIBREALSENSE_INCLUDE_DIR}" "${LIBREALSENSE_LIBRARIES}" "HAVE_LIBREALSENSE")
   endif()
 endif()
diff --git a/modules/videoio/cmake/detect_ueye.cmake b/modules/videoio/cmake/detect_ueye.cmake
index 495e9c245023..9428f9e59647 100644
--- a/modules/videoio/cmake/detect_ueye.cmake
+++ b/modules/videoio/cmake/detect_ueye.cmake
@@ -21,5 +21,3 @@ unset(_WIN_LIB_SUFFIX)
 if(HAVE_UEYE)
   ocv_add_external_target(ueye "${UEYE_INCLUDE}" "${UEYE_LIBRARY}" "HAVE_UEYE")
 endif()
-
-set(HAVE_UEYE ${HAVE_UEYE} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_v4l.cmake b/modules/videoio/cmake/detect_v4l.cmake
index 05b73b003c4f..e413dae9ca4e 100644
--- a/modules/videoio/cmake/detect_v4l.cmake
+++ b/modules/videoio/cmake/detect_v4l.cmake
@@ -15,5 +15,3 @@ if(NOT HAVE_V4L)
     ocv_add_external_target(v4l "" "" "${defs}")
   endif()
 endif()
-
-set(HAVE_V4L ${HAVE_V4L} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_ximea.cmake b/modules/videoio/cmake/detect_ximea.cmake
index 9cf295e3529b..7521e619b036 100644
--- a/modules/videoio/cmake/detect_ximea.cmake
+++ b/modules/videoio/cmake/detect_ximea.cmake
@@ -28,5 +28,3 @@ endif()
 if(HAVE_XIMEA)
   ocv_add_external_target(ximea "${XIMEA_INCLUDE}" "${XIMEA_LIBRARY}" "HAVE_XIMEA")
 endif()
-
-set(HAVE_XIMEA ${HAVE_XIMEA} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/detect_xine.cmake b/modules/videoio/cmake/detect_xine.cmake
index 3e1f3010a431..0a6f64235349 100644
--- a/modules/videoio/cmake/detect_xine.cmake
+++ b/modules/videoio/cmake/detect_xine.cmake
@@ -5,5 +5,3 @@ endif()
 if(HAVE_XINE)
   ocv_add_external_target(xine "${XINE_INCLUDE_DIRS}" "${XINE_LIBRARIES}" "HAVE_XINE")
 endif()
-
-set(HAVE_XINE ${HAVE_XINE} PARENT_SCOPE)
diff --git a/modules/videoio/cmake/init.cmake b/modules/videoio/cmake/init.cmake
index 310df2d249e3..af664f94df37 100644
--- a/modules/videoio/cmake/init.cmake
+++ b/modules/videoio/cmake/init.cmake
@@ -1,20 +1,13 @@
-include(FindPkgConfig)
-
-# FIXIT: stop using PARENT_SCOPE in dependencies
-if(PROJECT_NAME STREQUAL "OpenCV")
-  macro(add_backend backend_id cond_var)
-    if(${cond_var})
-      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
-    endif()
-  endmacro()
-else()
-  function(add_backend backend_id cond_var)
-    if(${cond_var})
-      include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
-    endif()
-  endfunction()
+if(NOT PROJECT_NAME STREQUAL "OpenCV")
+  include(FindPkgConfig)
 endif()
 
+macro(add_backend backend_id cond_var)
+  if(${cond_var})
+    include("${CMAKE_CURRENT_LIST_DIR}/detect_${backend_id}.cmake")
+  endif()
+endmacro()
+
 add_backend("ffmpeg" WITH_FFMPEG)
 add_backend("gstreamer" WITH_GSTREAMER)
 add_backend("v4l" WITH_V4L)
diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp
index 348448bda7a8..16016e4b8e9a 100644
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@@ -186,6 +186,8 @@ enum VideoCaptureProperties {
        CAP_PROP_HW_ACCELERATION=50, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in cv::VideoCapture constructor / .open() method. Default value is backend-specific.
        CAP_PROP_HW_DEVICE      =51, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific.
        CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat.
+       CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg back-end only)
+       CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg back-end only)
 #ifndef CV_DOXYGEN
        CV__CAP_PROP_LATEST
 #endif
diff --git a/modules/videoio/src/cap_android_camera.cpp b/modules/videoio/src/cap_android_camera.cpp
index b369a12a6861..5952b6f08c48 100644
--- a/modules/videoio/src/cap_android_camera.cpp
+++ b/modules/videoio/src/cap_android_camera.cpp
@@ -304,8 +304,8 @@ class AndroidCameraCapture : public IVideoCapture
         AImage_getPlaneRowStride(image.get(), 0, &yStride);
         AImage_getPlaneRowStride(image.get(), 1, &uvStride);
         AImage_getPlaneData(image.get(), 0, &yPixel, &yLen);
-        AImage_getPlaneData(image.get(), 1, &vPixel, &vLen);
-        AImage_getPlaneData(image.get(), 2, &uPixel, &uLen);
+        AImage_getPlaneData(image.get(), 1, &uPixel, &uLen);
+        AImage_getPlaneData(image.get(), 2, &vPixel, &vLen);
         AImage_getPlanePixelStride(image.get(), 1, &uvPixelStride);
 
         if ( (uvPixelStride == 2) && (vPixel == uPixel + 1) && (yLen == frameWidth * frameHeight) && (uLen == ((yLen / 2) - 1)) && (vLen == uLen) ) {
@@ -313,7 +313,7 @@ class AndroidCameraCapture : public IVideoCapture
             if (fourCC == FOURCC_UNKNOWN) {
                 fourCC = FOURCC_NV21;
             }
-        } else if ( (uvPixelStride == 1) && (vPixel = uPixel + uLen) && (yLen == frameWidth * frameHeight) && (uLen == yLen / 4) && (vLen == uLen) ) {
+        } else if ( (uvPixelStride == 1) && (vPixel == uPixel + uLen) && (yLen == frameWidth * frameHeight) && (uLen == yLen / 4) && (vLen == uLen) ) {
             colorFormat = COLOR_FormatYUV420Planar;
             if (fourCC == FOURCC_UNKNOWN) {
                 fourCC = FOURCC_YV12;
diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp
index 1e73cb8fc881..9ec75501d040 100644
--- a/modules/videoio/src/cap_ffmpeg_impl.hpp
+++ b/modules/videoio/src/cap_ffmpeg_impl.hpp
@@ -183,8 +183,8 @@ extern "C" {
 #endif
 
 #if USE_AV_INTERRUPT_CALLBACK
-#define LIBAVFORMAT_INTERRUPT_OPEN_TIMEOUT_MS 30000
-#define LIBAVFORMAT_INTERRUPT_READ_TIMEOUT_MS 30000
+#define LIBAVFORMAT_INTERRUPT_OPEN_DEFAULT_TIMEOUT_MS 30000
+#define LIBAVFORMAT_INTERRUPT_READ_DEFAULT_TIMEOUT_MS 30000
 
 #ifdef _WIN32
 // http://stackoverflow.com/questions/5404277/porting-clock-gettime-to-windows
@@ -523,6 +523,8 @@ struct CvCapture_FFMPEG
 
     AVDictionary *dict;
 #if USE_AV_INTERRUPT_CALLBACK
+    int open_timeout;
+    int read_timeout;
     AVInterruptCallbackMetadata interrupt_metadata;
 #endif
 
@@ -569,6 +571,11 @@ void CvCapture_FFMPEG::init()
 #endif
     dict = NULL;
 
+#if USE_AV_INTERRUPT_CALLBACK
+    open_timeout = LIBAVFORMAT_INTERRUPT_OPEN_DEFAULT_TIMEOUT_MS;
+    read_timeout = LIBAVFORMAT_INTERRUPT_READ_DEFAULT_TIMEOUT_MS;
+#endif
+
     rawMode = false;
     rawModeInitialized = false;
     memset(&packet_filtered, 0, sizeof(packet_filtered));
@@ -928,6 +935,16 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
         if (params.has(CAP_PROP_HW_ACCELERATION_USE_OPENCL)) {
             use_opencl = params.get<int>(CAP_PROP_HW_ACCELERATION_USE_OPENCL);
         }
+#if USE_AV_INTERRUPT_CALLBACK
+        if (params.has(CAP_PROP_OPEN_TIMEOUT_MSEC))
+        {
+            open_timeout = params.get<int>(CAP_PROP_OPEN_TIMEOUT_MSEC);
+        }
+        if (params.has(CAP_PROP_READ_TIMEOUT_MSEC))
+        {
+            read_timeout = params.get<int>(CAP_PROP_READ_TIMEOUT_MSEC);
+        }
+#endif
         if (params.warnUnusedParameters())
         {
             CV_LOG_ERROR(NULL, "VIDEOIO/FFMPEG: unsupported parameters in .open(), see logger INFO channel for details. Bailout");
@@ -937,7 +954,7 @@ bool CvCapture_FFMPEG::open(const char* _filename, const VideoCaptureParameters&
 
 #if USE_AV_INTERRUPT_CALLBACK
     /* interrupt callback */
-    interrupt_metadata.timeout_after_ms = LIBAVFORMAT_INTERRUPT_OPEN_TIMEOUT_MS;
+    interrupt_metadata.timeout_after_ms = open_timeout;
     get_monotonic_time(&interrupt_metadata.value);
 
     ic = avformat_alloc_context();
@@ -1282,7 +1299,7 @@ bool CvCapture_FFMPEG::grabFrame()
 #if USE_AV_INTERRUPT_CALLBACK
     // activate interrupt callback
     get_monotonic_time(&interrupt_metadata.value);
-    interrupt_metadata.timeout_after_ms = LIBAVFORMAT_INTERRUPT_READ_TIMEOUT_MS;
+    interrupt_metadata.timeout_after_ms = read_timeout;
 #endif
 
 #if USE_AV_SEND_FRAME_API
diff --git a/modules/videoio/src/cap_gstreamer.cpp b/modules/videoio/src/cap_gstreamer.cpp
index 60ecf6611a87..e040a22cb0f1 100644
--- a/modules/videoio/src/cap_gstreamer.cpp
+++ b/modules/videoio/src/cap_gstreamer.cpp
@@ -475,8 +475,9 @@ bool GStreamerCapture::retrieveFrame(int, OutputArray dst)
     //     video/x-raw, format=I420  -> 8bit, 1 channel (height is 1.5x larger than true height)
     //     video/x-bayer             -> 8bit, 1 channel
     //     image/jpeg                -> 8bit, mjpeg: buffer_size x 1 x 1
+    //     video/x-raw, format=GRAY16_LE (BE) -> 16 bit, 1 channel
+    //     video/x-raw, format={BGRA, RGBA, BGRx, RGBx} -> 8bit, 4 channels
     // bayer data is never decoded, the user is responsible for that
-    // everything is 8 bit, so we just test the caps for bit depth
     Size sz = Size(frame_width, frame_height);
     guint n_planes = GST_VIDEO_INFO_N_PLANES(&info);
     if (name == "video/x-raw")
@@ -507,6 +508,24 @@ bool GStreamerCapture::retrieveFrame(int, OutputArray dst)
             src.copyTo(dst);
             return true;
         }
+        else if (format == "GRAY16_LE" || format == "GRAY16_BE")
+        {
+            CV_CheckEQ((int)n_planes, 1, "");
+            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            CV_CheckGE(step, (size_t)frame_width, "");
+            Mat src(sz, CV_16UC1, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            src.copyTo(dst);
+            return true;
+        }
+        else if (format == "BGRA" || format == "RGBA" || format == "BGRX" || format == "RGBX")
+        {
+            CV_CheckEQ((int)n_planes, 1, "");
+            size_t step = GST_VIDEO_INFO_PLANE_STRIDE(&info, 0);
+            CV_CheckGE(step, (size_t)frame_width, "");
+            Mat src(sz, CV_8UC4, map_info.data + GST_VIDEO_INFO_PLANE_OFFSET(&info, 0), step);
+            src.copyTo(dst);
+            return true;
+        }
         else if (format == "UYVY" || format == "YUY2" || format == "YVYU")
         {
             CV_CheckEQ((int)n_planes, 1, "");
@@ -1008,7 +1027,7 @@ bool GStreamerCapture::open(const String &filename_, const cv::VideoCaptureParam
         sink_pad.attach(gst_element_get_static_pad(sink, "sink"));
         peer_caps.attach(gst_pad_peer_query_caps(sink_pad, NULL));
         if (!gst_caps_can_intersect(caps, peer_caps)) {
-            caps.attach(gst_caps_from_string("video/x-raw, format=(string){UYVY,YUY2,YVYU,NV12,NV21,YV12,I420}"));
+            caps.attach(gst_caps_from_string("video/x-raw, format=(string){UYVY,YUY2,YVYU,NV12,NV21,YV12,I420,BGRA,RGBA,BGRx,RGBx,GRAY16_LE,GRAY16_BE}"));
             CV_Assert(caps);
         }
     }
diff --git a/modules/videoio/src/cap_msmf.cpp b/modules/videoio/src/cap_msmf.cpp
index 73288c3d03b1..9e45fd1bacce 100644
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
@@ -708,9 +708,10 @@ bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType& mt)
 _ComPtr<IMFAttributes> CvCapture_MSMF::getDefaultSourceConfig(UINT32 num)
 {
     CV_Assert(num > 0);
+    const bool OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS", true);
     _ComPtr<IMFAttributes> res;
     if (FAILED(MFCreateAttributes(&res, num)) ||
-        FAILED(res->SetUINT32(MF_READWRITE_ENABLE_HARDWARE_TRANSFORMS, true)) ||
+        FAILED(res->SetUINT32(MF_READWRITE_ENABLE_HARDWARE_TRANSFORMS, OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS)) ||
         FAILED(res->SetUINT32(MF_SOURCE_READER_DISABLE_DXVA, false)) ||
         FAILED(res->SetUINT32(MF_SOURCE_READER_ENABLE_VIDEO_PROCESSING, false)) ||
         FAILED(res->SetUINT32(MF_SOURCE_READER_ENABLE_ADVANCED_VIDEO_PROCESSING, true))
diff --git a/modules/videoio/src/container_avi.cpp b/modules/videoio/src/container_avi.cpp
index 2fb2ee14f81f..3223e7709096 100644
--- a/modules/videoio/src/container_avi.cpp
+++ b/modules/videoio/src/container_avi.cpp
@@ -124,6 +124,7 @@ struct RiffList
     uint32_t m_size;
     uint32_t m_list_type_cc;
 };
+#pragma pack(pop)
 
 class VideoInputStream
 {
@@ -149,7 +150,6 @@ class VideoInputStream
     String  m_fname;
 };
 
-#pragma pack(pop)
 
 inline VideoInputStream& operator >> (VideoInputStream& is, AviMainHeader& avih)
 {
diff --git a/modules/videoio/test/test_gstreamer.cpp b/modules/videoio/test/test_gstreamer.cpp
index ca100367b126..207f6de50baa 100644
--- a/modules/videoio/test/test_gstreamer.cpp
+++ b/modules/videoio/test/test_gstreamer.cpp
@@ -35,6 +35,10 @@ TEST_P(videoio_gstreamer, read_check)
 
         cvtColor(decode_frame, rgb_frame, convertToRGB);
         cvtColor(rgb_frame, gray_frame, COLOR_RGB2GRAY);
+        if (gray_frame.depth() == CV_16U)
+        {
+            gray_frame.convertTo(gray_frame, CV_8U, 255.0/65535);
+        }
 
         vector<Vec3f> circles;
         HoughCircles(gray_frame, circles, HOUGH_GRADIENT, 1, gray_frame.rows/16, 100, 30, 1, 30 );
@@ -58,6 +62,10 @@ TEST_P(videoio_gstreamer, read_check)
 
 static const Param test_data[] = {
     make_tuple("video/x-raw, format=BGR"  , Size(640, 480), Size(640, 480), COLOR_BGR2RGB),
+    make_tuple("video/x-raw, format=BGRA" , Size(640, 480), Size(640, 480), COLOR_BGRA2RGB),
+    make_tuple("video/x-raw, format=RGBA" , Size(640, 480), Size(640, 480), COLOR_RGBA2RGB),
+    make_tuple("video/x-raw, format=BGRx" , Size(640, 480), Size(640, 480), COLOR_BGRA2RGB),
+    make_tuple("video/x-raw, format=RGBx" , Size(640, 480), Size(640, 480), COLOR_RGBA2RGB),
     make_tuple("video/x-raw, format=GRAY8", Size(640, 480), Size(640, 480), COLOR_GRAY2RGB),
     make_tuple("video/x-raw, format=UYVY" , Size(640, 480), Size(640, 480), COLOR_YUV2RGB_UYVY),
     make_tuple("video/x-raw, format=YUY2" , Size(640, 480), Size(640, 480), COLOR_YUV2RGB_YUY2),
@@ -76,6 +84,10 @@ static const Param test_data[] = {
     make_tuple("video/x-raw, format=NV21" , Size(322, 242), Size(322, 363), COLOR_YUV2RGB_NV21),
     make_tuple("video/x-raw, format=YV12" , Size(322, 242), Size(322, 363), COLOR_YUV2RGB_YV12),
     make_tuple("video/x-raw, format=I420" , Size(322, 242), Size(322, 363), COLOR_YUV2RGB_I420),
+
+    // 16 bit
+    make_tuple("video/x-raw, format=GRAY16_LE", Size(640, 480), Size(640, 480), COLOR_GRAY2RGB),
+    make_tuple("video/x-raw, format=GRAY16_BE", Size(640, 480), Size(640, 480), COLOR_GRAY2RGB),
 };
 
 INSTANTIATE_TEST_CASE_P(videoio, videoio_gstreamer, testing::ValuesIn(test_data));
diff --git a/platforms/linux/gnu.toolchain.cmake b/platforms/linux/gnu.toolchain.cmake
index cba08e7fbbf4..64258e65b3f5 100644
--- a/platforms/linux/gnu.toolchain.cmake
+++ b/platforms/linux/gnu.toolchain.cmake
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.5)
 
 # load settings in case of "try compile"
 set(TOOLCHAIN_CONFIG_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/toolchain.config.cmake")
diff --git a/platforms/linux/riscv.toolchain.cmake b/platforms/linux/riscv.toolchain.cmake
index 2a69d7e0048d..cea80bd9ba90 100644
--- a/platforms/linux/riscv.toolchain.cmake
+++ b/platforms/linux/riscv.toolchain.cmake
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.5)
 
 if(COMMAND toolchain_save_config)
   return() # prevent recursive call
diff --git a/platforms/linux/riscv64-gcc.toolchain.cmake b/platforms/linux/riscv64-gcc.toolchain.cmake
index c46d62a360d3..675879f86b9f 100644
--- a/platforms/linux/riscv64-gcc.toolchain.cmake
+++ b/platforms/linux/riscv64-gcc.toolchain.cmake
@@ -10,8 +10,8 @@ set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-unknown-linux-gnu-g
 # Don't run the linker on compiler check
 set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
 
-set(CMAKE_C_FLAGS "-march=rv64gcv_zvqmac ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zvqmac ${CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh ${CXX_FLAGS}")
 
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_SYSROOT})
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
diff --git a/platforms/semihosting/aarch64-semihosting.toolchain.cmake b/platforms/semihosting/aarch64-semihosting.toolchain.cmake
new file mode 100644
index 000000000000..95bbda3bedba
--- /dev/null
+++ b/platforms/semihosting/aarch64-semihosting.toolchain.cmake
@@ -0,0 +1,40 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html
+
+set(CMAKE_SYSTEM_NAME               Generic)
+set(CMAKE_SYSTEM_PROCESSOR          AArch64)
+
+set(CMAKE_TRY_COMPILE_TARGET_TYPE   STATIC_LIBRARY)
+
+set(PORT_FILE ${CMAKE_SOURCE_DIR}/platforms/semihosting/include/aarch64_semihosting_port.hpp)
+
+set(COMMON_FLAGS "--specs=rdimon.specs -DOPENCV_INCLUDE_PORT_FILE=\\\"${PORT_FILE}\\\"")
+
+set(CMAKE_AR                        ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-ar${CMAKE_EXECUTABLE_SUFFIX})
+set(CMAKE_ASM_COMPILER              ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-gcc${CMAKE_EXECUTABLE_SUFFIX})
+set(CMAKE_C_COMPILER                ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-gcc${CMAKE_EXECUTABLE_SUFFIX})
+set(CMAKE_CXX_COMPILER              ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-g++${CMAKE_EXECUTABLE_SUFFIX})
+set(CMAKE_LINKER                    ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-ld${CMAKE_EXECUTABLE_SUFFIX})
+set(CMAKE_OBJCOPY                   ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-objcopy${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
+set(CMAKE_RANLIB                    ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-ranlib${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
+set(CMAKE_SIZE                      ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-size${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
+set(CMAKE_STRIP                     ${SEMIHOSTING_TOOLCHAIN_PATH}aarch64-none-elf-strip${CMAKE_EXECUTABLE_SUFFIX} CACHE INTERNAL "")
+set(CMAKE_C_FLAGS                   ${COMMON_FLAGS} CACHE INTERNAL "")
+set(CMAKE_CXX_FLAGS                 ${COMMON_FLAGS} CACHE INTERNAL "")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+set(OPENCV_SEMIHOSTING ON)
+set(OPENCV_DISABLE_THREAD_SUPPORT ON)
+set(OPENCV_DISABLE_FILESYSTEM_SUPPORT ON)
+set(BUILD_SHARED_LIBS OFF)
+set(OPENCV_FORCE_3RDPARTY_BUILD OFF)
+
+
+# Enable newlib.
+add_definitions(-D_GNU_SOURCE)
+
+add_definitions(-D_POSIX_PATH_MAX=0)
diff --git a/platforms/semihosting/include/aarch64_semihosting_port.hpp b/platforms/semihosting/include/aarch64_semihosting_port.hpp
new file mode 100644
index 000000000000..d3151c240a30
--- /dev/null
+++ b/platforms/semihosting/include/aarch64_semihosting_port.hpp
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef AARCH64_BAREMETAL_PORT_HPP
+#define AARCH64_BAREMETAL_PORT_HPP
+
+#include <malloc.h> // Needed for `memalign`.
+#include <sys/errno.h> // Needed for `ENOMEM`.
+
+// -std=c++11 is missing the following definitions when targeting
+// semihosting on aarch64.
+#if __cplusplus == 201103L
+#include <cmath>
+#define M_PI 3.14159265358979323846
+#define M_SQRT2 1.41421356237309504880
+
+namespace std {
+inline double cbrt(double x) {
+    return ::cbrt(x);
+}
+inline double copysign(double mag, double sgn) {
+    return ::copysign(mag, sgn);
+}
+} //namespace std
+#endif // __cplusplus == 201103L
+
+extern "C" {
+// Redirect the implementation of `posix_memalign` to `memalign`
+// as the former is
+// missing at link time. https://pubs.opengroup.org/onlinepubs/9699919799/functions/posix_memalign.html
+__attribute__((weak)) int posix_memalign(void **memptr, size_t alignment, size_t size) {
+    void * ptr =  memalign(alignment, size);
+    if (ptr != NULL) {
+        *memptr = ptr;
+        return 0;
+    }
+    return ENOMEM;
+}
+} // extern "C"
+
+#endif
diff --git a/platforms/winpack_dldt/2021.4/20210630-dldt-disable-multidevice-autoplugin.patch b/platforms/winpack_dldt/2021.4/20210630-dldt-disable-multidevice-autoplugin.patch
new file mode 100644
index 000000000000..f1e748744277
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/20210630-dldt-disable-multidevice-autoplugin.patch
@@ -0,0 +1,16 @@
+diff --git a/inference-engine/src/CMakeLists.txt b/inference-engine/src/CMakeLists.txt
+index 0ba0dd78..7d34e7cb 100644
+--- a/inference-engine/src/CMakeLists.txt
++++ b/inference-engine/src/CMakeLists.txt
+@@ -26,9 +26,9 @@ endif()
+ 
+ add_subdirectory(hetero_plugin)
+ 
+-add_subdirectory(auto_plugin)
++#add_subdirectory(auto_plugin)
+ 
+-add_subdirectory(multi_device)
++#add_subdirectory(multi_device)
+ 
+ add_subdirectory(transformations)
+ 
diff --git a/platforms/winpack_dldt/2021.4/20210630-dldt-disable-unused-targets.patch b/platforms/winpack_dldt/2021.4/20210630-dldt-disable-unused-targets.patch
new file mode 100644
index 000000000000..9d44cdadc6cd
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/20210630-dldt-disable-unused-targets.patch
@@ -0,0 +1,219 @@
+diff --git a/cmake/developer_package/add_ie_target.cmake b/cmake/developer_package/add_ie_target.cmake
+index d49f16a4d..2726ca787 100644
+--- a/cmake/developer_package/add_ie_target.cmake
++++ b/cmake/developer_package/add_ie_target.cmake
+@@ -92,7 +92,7 @@ function(addIeTarget)
+     if (ARG_TYPE STREQUAL EXECUTABLE)
+         add_executable(${ARG_NAME} ${all_sources})
+     elseif(ARG_TYPE STREQUAL STATIC OR ARG_TYPE STREQUAL SHARED)
+-        add_library(${ARG_NAME} ${ARG_TYPE} ${all_sources})
++        add_library(${ARG_NAME} ${ARG_TYPE} EXCLUDE_FROM_ALL ${all_sources})
+     else()
+         message(SEND_ERROR "Invalid target type ${ARG_TYPE} specified for target name ${ARG_NAME}")
+     endif()
+diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt
+index 1ac7fd8bf..df7091e51 100644
+--- a/inference-engine/CMakeLists.txt
++++ b/inference-engine/CMakeLists.txt
+@@ -39,7 +39,7 @@ if(ENABLE_TESTS)
+     add_subdirectory(tests)
+ endif()
+ 
+-add_subdirectory(tools)
++#add_subdirectory(tools)
+ 
+ function(ie_build_samples)
+     # samples should be build with the same flags as from OpenVINO package,
+@@ -58,7 +58,7 @@ endfunction()
+ 
+ # gflags and format_reader targets are kept inside of samples directory and
+ # they must be built even if samples build is disabled (required for tests and tools).
+-ie_build_samples()
++#ie_build_samples()
+ 
+ if(ENABLE_PYTHON)
+     add_subdirectory(ie_bridges/python)
+@@ -142,7 +142,7 @@ endif()
+ # Developer package
+ #
+ 
+-openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
++#openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
+ 
+ # for Template plugin
+ if(NGRAPH_INTERPRETER_ENABLE)
+@@ -166,7 +166,7 @@ function(ie_generate_dev_package_config)
+                 @ONLY)
+ endfunction()
+ 
+-ie_generate_dev_package_config()
++#ie_generate_dev_package_config()
+ 
+ #
+ # Coverage
+diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt
+index e8ed1a5c4..1fc9fc3ff 100644
+--- a/inference-engine/src/inference_engine/CMakeLists.txt
++++ b/inference-engine/src/inference_engine/CMakeLists.txt
+@@ -110,7 +110,7 @@ add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_sr
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS}
+             ${PUBLIC_HEADERS})
+@@ -181,7 +181,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_legacy_obj>
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>
+             ${IE_STATIC_DEPENDENT_FILES})
+diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt
+index 8eae82bd2..e0e6745b1 100644
+--- a/inference-engine/src/legacy_api/CMakeLists.txt
++++ b/inference-engine/src/legacy_api/CMakeLists.txt
+@@ -26,7 +26,7 @@ endif()
+ 
+ file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${PUBLIC_HEADERS})
+ 
+diff --git a/inference-engine/src/mkldnn_plugin/CMakeLists.txt b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+index fe57b29dd..07831e2fb 100644
+--- a/inference-engine/src/mkldnn_plugin/CMakeLists.txt
++++ b/inference-engine/src/mkldnn_plugin/CMakeLists.txt
+@@ -67,7 +67,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ #  add test object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL ${SOURCES} ${HEADERS})
+ target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
+ 
+ target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
+diff --git a/inference-engine/src/preprocessing/CMakeLists.txt b/inference-engine/src/preprocessing/CMakeLists.txt
+index f9548339d..ef962145a 100644
+--- a/inference-engine/src/preprocessing/CMakeLists.txt
++++ b/inference-engine/src/preprocessing/CMakeLists.txt
+@@ -101,7 +101,7 @@ endif()
+ 
+ # Create object library
+ 
+-add_library(${TARGET_NAME}_obj OBJECT
++add_library(${TARGET_NAME}_obj OBJECT EXCLUDE_FROM_ALL
+             ${LIBRARY_SRC}
+             ${LIBRARY_HEADERS})
+ 
+@@ -153,7 +153,7 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
+ 
+ # Static library used for unit tests which are always built
+ 
+-add_library(${TARGET_NAME}_s STATIC
++add_library(${TARGET_NAME}_s STATIC EXCLUDE_FROM_ALL
+             $<TARGET_OBJECTS:${TARGET_NAME}_obj>)
+ 
+ set_ie_threading_interface_for(${TARGET_NAME}_s)
+diff --git a/inference-engine/src/vpu/common/CMakeLists.txt b/inference-engine/src/vpu/common/CMakeLists.txt
+index 249e47c28..4ddf63049 100644
+--- a/inference-engine/src/vpu/common/CMakeLists.txt
++++ b/inference-engine/src/vpu/common/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h)
+ 
+ function(add_common_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
++    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     ie_faster_build(${TARGET_NAME}
+         UNITY
+@@ -60,7 +60,7 @@ add_common_target("vpu_common_lib" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_common_target("vpu_common_lib_test_static" TRUE)
++    #add_common_target("vpu_common_lib_test_static" TRUE)
+ else()
+     add_library("vpu_common_lib_test_static" ALIAS "vpu_common_lib")
+ endif()
+diff --git a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+index bc73ab5b1..b4c1547fc 100644
+--- a/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
++++ b/inference-engine/src/vpu/graph_transformer/CMakeLists.txt
+@@ -5,7 +5,7 @@
+ file(GLOB_RECURSE SOURCES *.cpp *.hpp *.h *.inc)
+ 
+ function(add_graph_transformer_target TARGET_NAME STATIC_IE)
+-    add_library(${TARGET_NAME} STATIC ${SOURCES})
++    add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+     set_ie_threading_interface_for(${TARGET_NAME})
+ 
+@@ -70,7 +70,7 @@ add_graph_transformer_target("vpu_graph_transformer" FALSE)
+ 
+ # Unit tests support for graph transformer
+ if(WIN32)
+-    add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
++    #add_graph_transformer_target("vpu_graph_transformer_test_static" TRUE)
+ else()
+     add_library("vpu_graph_transformer_test_static" ALIAS "vpu_graph_transformer")
+ endif()
+diff --git a/inference-engine/thirdparty/pugixml/CMakeLists.txt b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+index 8bcb2801a..f7e031c01 100644
+--- a/inference-engine/thirdparty/pugixml/CMakeLists.txt
++++ b/inference-engine/thirdparty/pugixml/CMakeLists.txt
+@@ -41,7 +41,7 @@ if(BUILD_SHARED_LIBS)
+ else()
+ 	add_library(pugixml STATIC ${SOURCES})
+ 	if (MSVC)
+-		add_library(pugixml_mt STATIC ${SOURCES})
++                #add_library(pugixml_mt STATIC ${SOURCES})
+ 		#if (WIN32)
+ 		#	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+ 		#	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
+diff --git a/ngraph/core/builder/CMakeLists.txt b/ngraph/core/builder/CMakeLists.txt
+index ff5c381e7..2797ec9ab 100644
+--- a/ngraph/core/builder/CMakeLists.txt
++++ b/ngraph/core/builder/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/ngraph/core/reference/CMakeLists.txt b/ngraph/core/reference/CMakeLists.txt
+index ef4a764ab..f6d3172e2 100644
+--- a/ngraph/core/reference/CMakeLists.txt
++++ b/ngraph/core/reference/CMakeLists.txt
+@@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC})
+ source_group("include" FILES ${PUBLIC_HEADERS})
+ 
+ # Create shared library
+-add_library(${TARGET_NAME} STATIC ${LIBRARY_SRC} ${PUBLIC_HEADERS})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+ 
+ if(COMMAND ie_faster_build)
+     ie_faster_build(${TARGET_NAME}
+diff --git a/openvino/itt/CMakeLists.txt b/openvino/itt/CMakeLists.txt
+index e9f880b8c..c63f4df63 100644
+--- a/openvino/itt/CMakeLists.txt
++++ b/openvino/itt/CMakeLists.txt
+@@ -6,7 +6,7 @@ set(TARGET_NAME itt)
+ 
+ file(GLOB_RECURSE SOURCES "src/*.cpp" "src/*.hpp")
+ 
+-add_library(${TARGET_NAME} STATIC ${SOURCES})
++add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+ 
+ add_library(openvino::itt ALIAS ${TARGET_NAME})
+ 
diff --git a/platforms/winpack_dldt/2021.4/20210630-dldt-pdb.patch b/platforms/winpack_dldt/2021.4/20210630-dldt-pdb.patch
new file mode 100644
index 000000000000..65e6f84dc80b
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/20210630-dldt-pdb.patch
@@ -0,0 +1,15 @@
+iff --git a/CMakeLists.txt b/CMakeLists.txt
+index e0706a72e..9a053b1e4 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -6,6 +6,10 @@ cmake_minimum_required(VERSION 3.13)
+ 
+ project(OpenVINO)
+ 
++set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi /FS")
++set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
++set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
++
+ set(OpenVINO_MAIN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+ set(IE_MAIN_SOURCE_DIR ${OpenVINO_MAIN_SOURCE_DIR}/inference-engine)
+ 
diff --git a/platforms/winpack_dldt/2021.4/20210630-dldt-vs-version.patch b/platforms/winpack_dldt/2021.4/20210630-dldt-vs-version.patch
new file mode 100644
index 000000000000..36b0068775eb
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/20210630-dldt-vs-version.patch
@@ -0,0 +1,16 @@
+diff --git a/cmake/developer_package/vs_version/vs_version.cmake b/cmake/developer_package/vs_version/vs_version.cmake
+index 14d4c0e1e..6a44f73b9 100644
+--- a/cmake/developer_package/vs_version/vs_version.cmake
++++ b/cmake/developer_package/vs_version/vs_version.cmake
+@@ -8,9 +8,9 @@ set(IE_VS_VER_FILEVERSION_STR "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERS
+ 
+ set(IE_VS_VER_COMPANY_NAME_STR "Intel Corporation")
+ set(IE_VS_VER_PRODUCTVERSION_STR "${CI_BUILD_NUMBER}")
+-set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit")
++set(IE_VS_VER_PRODUCTNAME_STR "OpenVINO toolkit (for OpenCV Windows package)")
+ set(IE_VS_VER_COPYRIGHT_STR "Copyright (C) 2018-2021, Intel Corporation")
+-set(IE_VS_VER_COMMENTS_STR "https://docs.openvinotoolkit.org/")
++set(IE_VS_VER_COMMENTS_STR "https://github.com/opencv/opencv/wiki/Intel%27s-Deep-Learning-Inference-Engine-backend")
+ 
+ #
+ # ie_add_vs_version_file(NAME <name>
diff --git a/platforms/winpack_dldt/2021.4/build.config.py b/platforms/winpack_dldt/2021.4/build.config.py
new file mode 100644
index 000000000000..33ef1050cad4
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/build.config.py
@@ -0,0 +1 @@
+os.environ['CI_BUILD_NUMBER'] = '2021.4.0-opencv_winpack_dldt'
diff --git a/platforms/winpack_dldt/2021.4/patch.config.py b/platforms/winpack_dldt/2021.4/patch.config.py
new file mode 100644
index 000000000000..7f8715aae2da
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/patch.config.py
@@ -0,0 +1,4 @@
+applyPatch('20210630-dldt-disable-unused-targets.patch')
+applyPatch('20210630-dldt-pdb.patch')
+applyPatch('20210630-dldt-disable-multidevice-autoplugin.patch')
+applyPatch('20210630-dldt-vs-version.patch')
diff --git a/platforms/winpack_dldt/2021.4/sysroot.config.py b/platforms/winpack_dldt/2021.4/sysroot.config.py
new file mode 100644
index 000000000000..fa4281107d23
--- /dev/null
+++ b/platforms/winpack_dldt/2021.4/sysroot.config.py
@@ -0,0 +1,56 @@
+sysroot_bin_dir = prepare_dir(self.sysrootdir / 'bin')
+copytree(self.build_dir / 'install', self.sysrootdir / 'ngraph')
+#rm_one(self.sysrootdir / 'ngraph' / 'lib' / 'ngraph.dll')
+
+build_config = 'Release' if not self.config.build_debug else 'Debug'
+build_bin_dir = self.build_dir / 'bin' / 'intel64' / build_config
+
+def copy_bin(name):
+    global build_bin_dir, sysroot_bin_dir
+    copytree(build_bin_dir / name, sysroot_bin_dir / name)
+
+dll_suffix = 'd' if self.config.build_debug else ''
+def copy_dll(name):
+    global copy_bin, dll_suffix
+    copy_bin(name + dll_suffix + '.dll')
+    copy_bin(name + dll_suffix + '.pdb')
+
+copy_bin('cache.json')
+copy_dll('clDNNPlugin')
+copy_dll('HeteroPlugin')
+copy_dll('inference_engine')
+copy_dll('inference_engine_ir_reader')
+#copy_dll('inference_engine_ir_v7_reader')
+copy_dll('inference_engine_legacy')
+copy_dll('inference_engine_transformations')  # runtime
+copy_dll('inference_engine_lp_transformations')  # runtime
+#copy_dll('inference_engine_preproc')  # runtime
+copy_dll('MKLDNNPlugin')  # runtime
+copy_dll('myriadPlugin')  # runtime
+#copy_dll('MultiDevicePlugin')  # runtime, not used
+copy_dll('ngraph')
+copy_bin('plugins.xml')
+copy_bin('pcie-ma2x8x.elf')
+copy_bin('usb-ma2x8x.mvcmd')
+
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb' / 'bin', sysroot_bin_dir)
+copytree(self.srcdir / 'inference-engine' / 'temp' / 'tbb', self.sysrootdir / 'tbb')
+
+sysroot_ie_dir = prepare_dir(self.sysrootdir / 'deployment_tools' / 'inference_engine')
+sysroot_ie_lib_dir = prepare_dir(sysroot_ie_dir / 'lib' / 'intel64')
+
+copytree(self.srcdir / 'inference-engine' / 'include', sysroot_ie_dir / 'include')
+if not self.config.build_debug:
+    copytree(build_bin_dir / 'ngraph.lib', sysroot_ie_lib_dir / 'ngraph.lib')
+    copytree(build_bin_dir / 'inference_engine.lib', sysroot_ie_lib_dir / 'inference_engine.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_reader.lib', sysroot_ie_lib_dir / 'inference_engine_ir_reader.lib')
+    copytree(build_bin_dir / 'inference_engine_legacy.lib', sysroot_ie_lib_dir / 'inference_engine_legacy.lib')
+else:
+    copytree(build_bin_dir / 'ngraphd.lib', sysroot_ie_lib_dir / 'ngraphd.lib')
+    copytree(build_bin_dir / 'inference_engined.lib', sysroot_ie_lib_dir / 'inference_engined.lib')
+    copytree(build_bin_dir / 'inference_engine_ir_readerd.lib', sysroot_ie_lib_dir / 'inference_engine_ir_readerd.lib')
+    copytree(build_bin_dir / 'inference_engine_legacyd.lib', sysroot_ie_lib_dir / 'inference_engine_legacyd.lib')
+
+sysroot_license_dir = prepare_dir(self.sysrootdir / 'etc' / 'licenses')
+copytree(self.srcdir / 'LICENSE', sysroot_license_dir / 'dldt-LICENSE')
+copytree(self.sysrootdir / 'tbb/LICENSE', sysroot_license_dir / 'tbb-LICENSE')
diff --git a/platforms/winpack_dldt/build_package.py b/platforms/winpack_dldt/build_package.py
index c3f835cac324..ad2bbb796ca1 100644
--- a/platforms/winpack_dldt/build_package.py
+++ b/platforms/winpack_dldt/build_package.py
@@ -189,7 +189,10 @@ def __init__(self, config):
         if self.srcdir is None:
             self.srcdir = prepare_dir(self.outdir / 'sources', clean=clean_src_dir)
         self.build_dir = prepare_dir(self.outdir / 'build', clean=self.config.clean_dldt)
-        self.sysrootdir = prepare_dir(self.outdir / 'sysroot', clean=self.config.clean_dldt)
+        self.sysrootdir = prepare_dir(self.outdir / 'sysroot', clean=self.config.clean_dldt or self.config.clean_dldt_sysroot)
+        if not (self.config.clean_dldt or self.config.clean_dldt_sysroot):
+            _ = prepare_dir(self.sysrootdir / 'bin', clean=True)  # always clean sysroot/bin (package files)
+            _ = prepare_dir(self.sysrootdir / 'etc', clean=True)  # always clean sysroot/etc (package files)
 
         if self.config.build_subst_drive:
             if os.path.exists(self.config.build_subst_drive + ':\\'):
@@ -214,7 +217,7 @@ def init_patchset(self):
             patch_hashsum = hashlib.md5(self.patch_file_contents.encode('utf-8')).hexdigest()
         except:
             log.warn("Can't compute hashsum of patches: %s", self.patch_file)
-        self.patch_hashsum = patch_hashsum
+        self.patch_hashsum = self.config.override_patch_hashsum if self.config.override_patch_hashsum else patch_hashsum
 
 
     def prepare_sources(self):
@@ -355,7 +358,6 @@ def build(self, builderDLDT):
             BUILD_PERF_TESTS='OFF',
             ENABLE_CXX11='ON',
             WITH_INF_ENGINE='ON',
-            INF_ENGINE_RELEASE=str(self.config.dldt_release),
             WITH_TBB='ON',
             CPU_BASELINE='AVX2',
             CMAKE_INSTALL_PREFIX=str(self.install_dir),
@@ -383,6 +385,9 @@ def build(self, builderDLDT):
             OPENCV_PYTHON_INSTALL_PATH='python',
         )
 
+        if self.config.dldt_release:
+            cmake_vars['INF_ENGINE_RELEASE'] = str(self.config.dldt_release)
+
         cmake_vars['INF_ENGINE_LIB_DIRS:PATH'] = str(builderDLDT.sysrootdir / 'deployment_tools/inference_engine/lib/intel64')
         assert os.path.exists(cmake_vars['INF_ENGINE_LIB_DIRS:PATH']), cmake_vars['INF_ENGINE_LIB_DIRS:PATH']
         cmake_vars['INF_ENGINE_INCLUDE_DIRS:PATH'] = str(builderDLDT.sysrootdir / 'deployment_tools/inference_engine/include')
@@ -466,8 +471,8 @@ def package_sources(self):
 def main():
 
     dldt_src_url = 'https://github.com/openvinotoolkit/openvino'
-    dldt_src_commit = '2021.3'
-    dldt_release = '2021030000'
+    dldt_src_commit = '2021.4'
+    dldt_release = None
 
     build_cache_dir_default = os.environ.get('BUILD_CACHE_DIR', '.build_cache')
     build_subst_drive = os.environ.get('BUILD_SUBST_DRIVE', None)
@@ -483,8 +488,9 @@ def main():
     parser.add_argument('--cmake_option', action='append', help='Append OpenCV CMake option')
     parser.add_argument('--cmake_option_dldt', action='append', help='Append CMake option for DLDT project')
 
-    parser.add_argument('--clean_dldt', action='store_true', help='Clear DLDT build and sysroot directories')
-    parser.add_argument('--clean_opencv', action='store_true', help='Clear OpenCV build directory')
+    parser.add_argument('--clean_dldt', action='store_true', help='Clean DLDT build and sysroot directories')
+    parser.add_argument('--clean_dldt_sysroot', action='store_true', help='Clean DLDT sysroot directories')
+    parser.add_argument('--clean_opencv', action='store_true', help='Clean OpenCV build directory')
 
     parser.add_argument('--build_debug', action='store_true', help='Build debug binaries')
     parser.add_argument('--build_tests', action='store_true', help='Build OpenCV tests')
@@ -494,13 +500,15 @@ def main():
     parser.add_argument('--dldt_src_branch', help='DLDT checkout branch')
     parser.add_argument('--dldt_src_commit', default=dldt_src_commit, help='DLDT source commit / tag (default: %s)' % dldt_src_commit)
     parser.add_argument('--dldt_src_git_clone_extra', action='append', help='DLDT git clone extra args')
-    parser.add_argument('--dldt_release', default=dldt_release, help='DLDT release code for INF_ENGINE_RELEASE (default: %s)' % dldt_release)
+    parser.add_argument('--dldt_release', default=dldt_release, help='DLDT release code for INF_ENGINE_RELEASE, e.g 2021030000 (default: %s)' % dldt_release)
 
     parser.add_argument('--dldt_reference_dir', help='DLDT reference git repository (optional)')
     parser.add_argument('--dldt_src_dir', help='DLDT custom source repository (skip git checkout and patching, use for TESTING only)')
 
     parser.add_argument('--dldt_config', help='Specify DLDT build configuration (defaults to evaluate from DLDT commit/branch)')
 
+    parser.add_argument('--override_patch_hashsum', default='', help='(script debug mode)')
+
     args = parser.parse_args()
 
     log.basicConfig(
diff --git a/platforms/winrt/readme.txt b/platforms/winrt/readme.txt
index 2fb4ce1f54c4..2d1b4e6c30c1 100644
--- a/platforms/winrt/readme.txt
+++ b/platforms/winrt/readme.txt
@@ -13,7 +13,7 @@ Install Visual Studio 2013 Community Edition
     http://go.microsoft.com/?linkid=9863608
 
 Install Visual Studio Express 2012 for Windows Desktop
-    http://www.microsoft.com/en-us/download/details.aspx?id=34673
+    https://devblogs.microsoft.com/visualstudio/visual-studio-express-2012-for-windows-desktop-is-here/
 
 
 
@@ -156,4 +156,4 @@ Manual build
 
   cmake -G "Visual Studio 12 2013 Win64" -DCMAKE_SYSTEM_NAME:String=WindowsStore -DCMAKE_SYSTEM_VERSION:String=8.1 -DCMAKE_VS_EFFECTIVE_PLATFORMS:String=x64 -DCMAKE_INSTALL_PREFIX:PATH=.\install\WS\8.1\x64\ ..
 
-Return to "Running tests for Windows Store", list item 4.
\ No newline at end of file
+Return to "Running tests for Windows Store", list item 4.
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 0c70698ccbf6..9bfc2bf8ada4 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -45,7 +45,12 @@ endif()
 if(INSTALL_PYTHON_EXAMPLES)
   add_subdirectory(python)
 endif()
-
+# The examples in this folder will work with a semihosting version of
+# OpenCV. For more information about semihosting, see
+# https://developer.arm.com/documentation/100863/latest
+if(OPENCV_SEMIHOSTING)
+  add_subdirectory(semihosting)
+endif()
 ocv_install_example_src("." CMakeLists.txt samples_utils.cmake)
 if(INSTALL_C_EXAMPLES)
   install(DIRECTORY data DESTINATION "${OPENCV_SAMPLES_SRC_INSTALL_PATH}" COMPONENT samples_data)
diff --git a/samples/cpp/grabcut.cpp b/samples/cpp/grabcut.cpp
index d3e3db49f9d1..25492166a781 100644
--- a/samples/cpp/grabcut.cpp
+++ b/samples/cpp/grabcut.cpp
@@ -107,12 +107,14 @@ void GCApplication::showImage() const
 
     Mat res;
     Mat binMask;
-    if( !isInitialized )
-        image->copyTo( res );
-    else
-    {
-        getBinMask( mask, binMask );
-        image->copyTo( res, binMask );
+    image->copyTo( res );
+    if( isInitialized ){
+        getBinMask( mask, binMask);
+
+        Mat black (binMask.rows, binMask.cols, CV_8UC3, cv::Scalar(0,0,0));
+        black.setTo(Scalar::all(255), binMask);
+
+        addWeighted(black, 0.5, res, 0.5, 0.0, res);
     }
 
     vector<Point>::const_iterator it;
@@ -201,24 +203,39 @@ void GCApplication::mouseClick( int event, int x, int y, int flags, void* )
     case EVENT_LBUTTONUP:
         if( rectState == IN_PROCESS )
         {
-            rect = Rect( Point(rect.x, rect.y), Point(x,y) );
-            rectState = SET;
-            setRectInMask();
-            CV_Assert( bgdPxls.empty() && fgdPxls.empty() && prBgdPxls.empty() && prFgdPxls.empty() );
+            if(rect.x == x || rect.y == y){
+                rectState = NOT_SET;
+            }
+            else{
+                rect = Rect( Point(rect.x, rect.y), Point(x,y) );
+                rectState = SET;
+                setRectInMask();
+                CV_Assert( bgdPxls.empty() && fgdPxls.empty() && prBgdPxls.empty() && prFgdPxls.empty() );
+            }
             showImage();
         }
         if( lblsState == IN_PROCESS )
         {
             setLblsInMask(flags, Point(x,y), false);
             lblsState = SET;
+            nextIter();
             showImage();
         }
+        else{
+            if(rectState == SET){
+                nextIter();
+                showImage();
+            }
+        }
         break;
     case EVENT_RBUTTONUP:
         if( prLblsState == IN_PROCESS )
         {
             setLblsInMask(flags, Point(x,y), true);
             prLblsState = SET;
+        }
+        if(rectState == SET){
+            nextIter();
             showImage();
         }
         break;
diff --git a/samples/cpp/kalman.cpp b/samples/cpp/kalman.cpp
index 501a749124c6..daf0ba5a7150 100644
--- a/samples/cpp/kalman.cpp
+++ b/samples/cpp/kalman.cpp
@@ -1,6 +1,6 @@
 #include "opencv2/video/tracking.hpp"
 #include "opencv2/highgui.hpp"
-
+#include "opencv2/core/cvdef.h"
 #include <stdio.h>
 
 using namespace cv;
@@ -14,15 +14,19 @@ static void help()
 {
     printf( "\nExample of c calls to OpenCV's Kalman filter.\n"
 "   Tracking of rotating point.\n"
-"   Rotation speed is constant.\n"
+"   Point moves in a circle and is characterized by a 1D state.\n"
+"   state_k+1 = state_k + speed + process_noise N(0, 1e-5)\n"
+"   The speed is constant.\n"
 "   Both state and measurements vectors are 1D (a point angle),\n"
-"   Measurement is the real point angle + gaussian noise.\n"
-"   The real and the estimated points are connected with yellow line segment,\n"
-"   the real and the measured points are connected with red line segment.\n"
+"   Measurement is the real state + gaussian noise N(0, 1e-1).\n"
+"   The real and the measured points are connected with red line segment,\n"
+"   the real and the estimated points are connected with yellow line segment,\n"
+"   the real and the corrected estimated points are connected with green line segment.\n"
 "   (if Kalman filter works correctly,\n"
-"    the yellow segment should be shorter than the red one).\n"
+"    the yellow segment should be shorter than the red one and\n"
+"    the green segment should be shorter than the yellow one)."
             "\n"
-"   Pressing any key (except ESC) will reset the tracking with a different speed.\n"
+"   Pressing any key (except ESC) will reset the tracking.\n"
 "   Pressing ESC will stop the program.\n"
             );
 }
@@ -39,7 +43,9 @@ int main(int, char**)
 
     for(;;)
     {
-        randn( state, Scalar::all(0), Scalar::all(0.1) );
+        img = Scalar::all(0);
+        state.at<float>(0) = 0.0f;
+        state.at<float>(1) = 2.f * (float)CV_PI / 6;
         KF.transitionMatrix = (Mat_<float>(2, 2) << 1, 1, 0, 1);
 
         setIdentity(KF.measurementMatrix);
@@ -60,36 +66,40 @@ int main(int, char**)
             double predictAngle = prediction.at<float>(0);
             Point predictPt = calcPoint(center, R, predictAngle);
 
-            randn( measurement, Scalar::all(0), Scalar::all(KF.measurementNoiseCov.at<float>(0)));
-
             // generate measurement
+            randn( measurement, Scalar::all(0), Scalar::all(KF.measurementNoiseCov.at<float>(0)));
             measurement += KF.measurementMatrix*state;
 
             double measAngle = measurement.at<float>(0);
             Point measPt = calcPoint(center, R, measAngle);
 
+            // correct the state estimates based on measurements
+            // updates statePost & errorCovPost
+            KF.correct(measurement);
+            double improvedAngle = KF.statePost.at<float>(0);
+            Point improvedPt = calcPoint(center, R, improvedAngle);
+
             // plot points
-            #define drawCross( center, color, d )                                        \
-                line( img, Point( center.x - d, center.y - d ),                          \
-                             Point( center.x + d, center.y + d ), color, 1, LINE_AA, 0); \
-                line( img, Point( center.x + d, center.y - d ),                          \
-                             Point( center.x - d, center.y + d ), color, 1, LINE_AA, 0 )
-
-            img = Scalar::all(0);
-            drawCross( statePt, Scalar(255,255,255), 3 );
-            drawCross( measPt, Scalar(0,0,255), 3 );
-            drawCross( predictPt, Scalar(0,255,0), 3 );
-            line( img, statePt, measPt, Scalar(0,0,255), 3, LINE_AA, 0 );
-            line( img, statePt, predictPt, Scalar(0,255,255), 3, LINE_AA, 0 );
-
-            if(theRNG().uniform(0,4) != 0)
-                KF.correct(measurement);
+            img = img * 0.2;
+            drawMarker(img, measPt, Scalar(0, 0, 255), cv::MARKER_SQUARE, 5, 2);
+            drawMarker(img, predictPt, Scalar(0, 255, 255), cv::MARKER_SQUARE, 5, 2);
+            drawMarker(img, improvedPt, Scalar(0, 255, 0), cv::MARKER_SQUARE, 5, 2);
+            drawMarker(img, statePt, Scalar(255, 255, 255), cv::MARKER_STAR, 10, 1);
+            // forecast one step
+            Mat test = Mat(KF.transitionMatrix*KF.statePost);
+            drawMarker(img, calcPoint(center, R, Mat(KF.transitionMatrix*KF.statePost).at<float>(0)),
+                       Scalar(255, 255, 0), cv::MARKER_SQUARE, 12, 1);
+
+            line( img, statePt, measPt, Scalar(0,0,255), 1, LINE_AA, 0 );
+            line( img, statePt, predictPt, Scalar(0,255,255), 1, LINE_AA, 0 );
+            line( img, statePt, improvedPt, Scalar(0,255,0), 1, LINE_AA, 0 );
+
 
             randn( processNoise, Scalar(0), Scalar::all(sqrt(KF.processNoiseCov.at<float>(0, 0))));
             state = KF.transitionMatrix*state + processNoise;
 
             imshow( "Kalman", img );
-            code = (char)waitKey(100);
+            code = (char)waitKey(1000);
 
             if( code > 0 )
                 break;
diff --git a/samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp b/samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp
index 5bcc878965a2..f9abbae94527 100644
--- a/samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp
+++ b/samples/cpp/tutorial_code/Histograms_Matching/MatchTemplate_Demo.cpp
@@ -89,7 +89,7 @@ void MatchingMethod( int, void* )
 
   //! [create_result_matrix]
   /// Create the result matrix
-  int result_cols =  img.cols - templ.cols + 1;
+  int result_cols = img.cols - templ.cols + 1;
   int result_rows = img.rows - templ.rows + 1;
 
   result.create( result_rows, result_cols, CV_32FC1 );
diff --git a/samples/cpp/tutorial_code/Histograms_Matching/calcBackProject_Demo1.cpp b/samples/cpp/tutorial_code/Histograms_Matching/calcBackProject_Demo1.cpp
index 61b6d607ceb6..bcb547a2fb9f 100644
--- a/samples/cpp/tutorial_code/Histograms_Matching/calcBackProject_Demo1.cpp
+++ b/samples/cpp/tutorial_code/Histograms_Matching/calcBackProject_Demo1.cpp
@@ -72,18 +72,18 @@ void Hist_and_Backproj(int, void* )
     //! [initialize]
     int histSize = MAX( bins, 2 );
     float hue_range[] = { 0, 180 };
-    const float* ranges = { hue_range };
+    const float* ranges[] = { hue_range };
     //! [initialize]
 
     //! [Get the Histogram and normalize it]
     Mat hist;
-    calcHist( &hue, 1, 0, Mat(), hist, 1, &histSize, &ranges, true, false );
+    calcHist( &hue, 1, 0, Mat(), hist, 1, &histSize, ranges, true, false );
     normalize( hist, hist, 0, 255, NORM_MINMAX, -1, Mat() );
     //! [Get the Histogram and normalize it]
 
     //! [Get Backprojection]
     Mat backproj;
-    calcBackProject( &hue, 1, 0, hist, backproj, &ranges, 1, true );
+    calcBackProject( &hue, 1, 0, hist, backproj, ranges, 1, true );
     //! [Get Backprojection]
 
     //! [Draw the backproj]
diff --git a/samples/cpp/tutorial_code/Histograms_Matching/calcHist_Demo.cpp b/samples/cpp/tutorial_code/Histograms_Matching/calcHist_Demo.cpp
index 86167e519a2f..a7582e42820a 100644
--- a/samples/cpp/tutorial_code/Histograms_Matching/calcHist_Demo.cpp
+++ b/samples/cpp/tutorial_code/Histograms_Matching/calcHist_Demo.cpp
@@ -37,7 +37,7 @@ int main(int argc, char** argv)
 
     //! [Set the ranges ( for B,G,R) )]
     float range[] = { 0, 256 }; //the upper boundary is exclusive
-    const float* histRange = { range };
+    const float* histRange[] = { range };
     //! [Set the ranges ( for B,G,R) )]
 
     //! [Set histogram param]
@@ -46,9 +46,9 @@ int main(int argc, char** argv)
 
     //! [Compute the histograms]
     Mat b_hist, g_hist, r_hist;
-    calcHist( &bgr_planes[0], 1, 0, Mat(), b_hist, 1, &histSize, &histRange, uniform, accumulate );
-    calcHist( &bgr_planes[1], 1, 0, Mat(), g_hist, 1, &histSize, &histRange, uniform, accumulate );
-    calcHist( &bgr_planes[2], 1, 0, Mat(), r_hist, 1, &histSize, &histRange, uniform, accumulate );
+    calcHist( &bgr_planes[0], 1, 0, Mat(), b_hist, 1, &histSize, histRange, uniform, accumulate );
+    calcHist( &bgr_planes[1], 1, 0, Mat(), g_hist, 1, &histSize, histRange, uniform, accumulate );
+    calcHist( &bgr_planes[2], 1, 0, Mat(), r_hist, 1, &histSize, histRange, uniform, accumulate );
     //! [Compute the histograms]
 
     //! [Draw the histograms for B, G and R]
diff --git a/samples/dnn/colorization.cpp b/samples/dnn/colorization.cpp
index b68e0ec4d8bf..6d751590d10d 100644
--- a/samples/dnn/colorization.cpp
+++ b/samples/dnn/colorization.cpp
@@ -50,7 +50,7 @@ int main(int argc, char **argv)
         "  https://github.com/richzhang/colorization\n"
         "Download caffemodel and prototxt files:\n"
         "  http://eecs.berkeley.edu/~rich.zhang/projects/2016_colorization/files/demo_v2/colorization_release_v2.caffemodel\n"
-        "  https://raw.githubusercontent.com/richzhang/colorization/master/colorization/models/colorization_deploy_v2.prototxt\n";
+        "  https://raw.githubusercontent.com/richzhang/colorization/caffe/models/colorization_deploy_v2.prototxt\n";
     const string keys =
         "{ h help |                                    | print this help message }"
         "{ proto  | colorization_deploy_v2.prototxt    | model configuration }"
diff --git a/samples/dnn/colorization.py b/samples/dnn/colorization.py
index c9eb2af3b668..5bdef9793e30 100644
--- a/samples/dnn/colorization.py
+++ b/samples/dnn/colorization.py
@@ -1,6 +1,6 @@
 # Script is based on https://github.com/richzhang/colorization/blob/master/colorization/colorize.py
-# To download the caffemodel and the prototxt, see: https://github.com/richzhang/colorization/tree/master/colorization/models
-# To download pts_in_hull.npy, see: https://github.com/richzhang/colorization/blob/master/colorization/resources/pts_in_hull.npy
+# To download the caffemodel and the prototxt, see: https://github.com/richzhang/colorization/tree/caffe/colorization/models
+# To download pts_in_hull.npy, see: https://github.com/richzhang/colorization/tree/caffe/colorization/resources/pts_in_hull.npy
 import numpy as np
 import argparse
 import cv2 as cv
diff --git a/samples/gdb/gdbinit b/samples/gdb/gdbinit
new file mode 100644
index 000000000000..228e8f702367
--- /dev/null
+++ b/samples/gdb/gdbinit
@@ -0,0 +1,23 @@
+set auto-load local-gdbinit on
+set print elements 0
+add-auto-load-safe-path /
+
+python
+# Update GDB's Python paths with the `sys.path` values of the local
+#  Python installation, whether that is brew'ed Python, a virtualenv,
+#  or another system python.
+
+# Convert GDB to interpret in Python
+
+import os, subprocess, sys
+
+# Execute a Python using the user's shell and pull out the sys.path (for site-packages)
+paths = subprocess.check_output('/usr/bin/python3 -c "import os,sys;print(os.linesep.join(sys.path).strip())"',shell=True).decode("utf-8").split()
+
+# Extend GDB's Python's search path
+sys.path.extend(paths)
+
+end
+
+
+source /your/path/to/mat_pretty_printer.py
diff --git a/samples/gdb/mat_pretty_printer.py b/samples/gdb/mat_pretty_printer.py
new file mode 100644
index 000000000000..e6ad2cbde212
--- /dev/null
+++ b/samples/gdb/mat_pretty_printer.py
@@ -0,0 +1,212 @@
+import gdb
+import numpy as np
+from enum import Enum
+
+np.set_printoptions(suppress=True)  # prevent numpy exponential notation on print, default False
+# np.set_printoptions(threshold=sys.maxsize)
+
+
+def conv(obj, t):
+    return gdb.parse_and_eval(f'({t})({obj})')
+
+
+def booli(obj):
+    return conv(str(obj).lower(), 'bool')
+
+
+def stri(obj):
+    s = f'"{obj}"'
+    return conv(s.translate(s.maketrans('\n', ' ')), 'char*')
+
+
+class MagicValues(Enum):
+    MAGIC_VAL = 0x42FF0000
+    AUTO_STEP = 0
+    CONTINUOUS_FLAG = 1 << 14
+    SUBMATRIX_FLAG = 1 << 15
+
+
+class MagicMasks(Enum):
+    MAGIC_MASK = 0xFFFF0000
+    TYPE_MASK = 0x00000FFF
+    DEPTH_MASK = 7
+
+
+class Depth(Enum):
+    CV_8U = 0
+    CV_8S = 1
+    CV_16U = 2
+    CV_16S = 3
+    CV_32S = 4
+    CV_32F = 5
+    CV_64F = 6
+    CV_16F = 7
+
+
+def create_enum(n):
+    def make_type(depth, cn):
+        return depth.value + ((cn - 1) << 3)
+    defs = [(f'{depth.name}C{i}', make_type(depth, i)) for depth in Depth for i in range(1, n + 1)]
+    return Enum('Type', defs)
+
+
+Type = create_enum(512)
+
+
+class Flags:
+    def depth(self):
+        return Depth(self.flags & MagicMasks.DEPTH_MASK.value)
+
+    def dtype(self):
+        depth = self.depth()
+        ret = None
+
+        if depth == Depth.CV_8U:
+            ret = (np.uint8, 'uint8_t')
+        elif depth == Depth.CV_8S:
+            ret = (np.int8, 'int8_t')
+        elif depth == Depth.CV_16U:
+            ret = (np.uint16, 'uint16_t')
+        elif depth == Depth.CV_16S:
+            ret = (np.int16, 'int16_t')
+        elif depth == Depth.CV_32S:
+            ret = (np.int32, 'int32_t')
+        elif depth == Depth.CV_32F:
+            ret = (np.float32, 'float')
+        elif depth == Depth.CV_64F:
+            ret = (np.float64, 'double')
+        elif depth == Depth.CV_16F:
+            ret = (np.float16, 'float16')
+
+        return ret
+
+    def type(self):
+        return Type(self.flags & MagicMasks.TYPE_MASK.value)
+
+    def channels(self):
+        return ((self.flags & (511 << 3)) >> 3) + 1
+
+    def is_continuous(self):
+        return (self.flags & MagicValues.CONTINUOUS_FLAG.value) != 0
+
+    def is_submatrix(self):
+        return (self.flags & MagicValues.SUBMATRIX_FLAG.value) != 0
+
+    def __init__(self, flags):
+        self.flags = flags
+
+    def __iter__(self):
+        return iter({
+                        'type': stri(self.type().name),
+                        'is_continuous': booli(self.is_continuous()),
+                        'is_submatrix': booli(self.is_submatrix())
+                    }.items())
+
+
+class Size:
+    def __init__(self, ptr):
+        self.ptr = ptr
+
+    def dims(self):
+        return int((self.ptr - 1).dereference())
+
+    def to_numpy(self):
+        return np.array([int(self.ptr[i]) for i in range(self.dims())], dtype=np.int64)
+
+    def __iter__(self):
+        return iter({'size': stri(self.to_numpy())}.items())
+
+
+class Mat:
+    def __init__(self, m, size, flags):
+        (dtype, ctype) = flags.dtype()
+        elsize = np.dtype(dtype).itemsize
+
+        ptr = m['data']
+        dataptr = int(ptr)
+        length = (int(m['dataend']) - dataptr) // elsize
+        start = (int(m['datastart']) - dataptr) // elsize
+
+        if length == 0:
+            self.mat = np.array([])
+            self.view = self.mat
+            return
+
+        if dtype != np.float16:
+            ctype = gdb.lookup_type(ctype)
+            ptr = ptr.cast(ctype.array(length - 1).pointer()).dereference()
+            self.mat = np.array([ptr[i] for i in range(length)], dtype=dtype)
+        else:
+            u16 = gdb.lookup_type('uint16_t')
+            ptr = ptr.cast(u16.array(length - 1).pointer()).dereference()
+            self.mat = np.array([ptr[i] for i in range(length)], dtype=np.uint16)
+            self.mat = self.mat.view(np.float16)
+
+        steps = np.asarray([int(m['step']['p'][i]) for i in range(size.dims())], dtype=np.int64)
+        self.view = np.lib.stride_tricks.as_strided(self.mat[start:], shape=size.to_numpy(), strides=steps)
+
+    def __iter__(self):
+        return iter({'data': stri(self.view)}.items())
+
+
+class MatPrinter:
+    """Print a cv::Mat"""
+
+    def __init__(self, mat):
+        self.mat = mat
+
+    def views(self):
+        m = self.mat
+
+        flags = Flags(int(m['flags']))
+        size = Size(m['size']['p'])
+        data = Mat(m, size, flags)
+
+        for x in [flags, size, data]:
+            for k, v in x:
+                yield 'view_' + k, v
+
+    def real(self):
+        m = self.mat
+
+        for field in m.type.fields():
+            k = field.name
+            v = m[k]
+            yield k, v
+
+        # TODO: add an enum in interface.h with all cv::Mat element types and use that instead
+        # yield 'test', gdb.parse_and_eval(f'(cv::MatTypes)0')
+
+    def children(self):  # TODO: hide real members under new child somehow
+        yield from self.views()
+        yield from self.real()
+
+
+def get_type(val):
+    # Get the type.
+    vtype = val.type
+
+    # If it points to a reference, get the reference.
+    if vtype.code == gdb.TYPE_CODE_REF:
+        vtype = vtype.target()
+
+    # Get the unqualified type, stripped of typedefs.
+    vtype = vtype.unqualified().strip_typedefs()
+
+    # Get the type name.
+    typename = vtype.tag
+
+    return typename
+
+
+def mat_printer(val):
+    typename = get_type(val)
+
+    if typename is None:
+        return None
+
+    if str(typename) == 'cv::Mat':
+        return MatPrinter(val)
+
+
+gdb.pretty_printers.append(mat_printer)
diff --git a/samples/python/camera_calibration_show_extrinsics.py b/samples/python/camera_calibration_show_extrinsics.py
index 0118b5b913d5..d676691f15d7 100755
--- a/samples/python/camera_calibration_show_extrinsics.py
+++ b/samples/python/camera_calibration_show_extrinsics.py
@@ -188,7 +188,7 @@ def main():
 
     fig = plt.figure()
     ax = fig.gca(projection='3d')
-    ax.set_aspect("equal")
+    ax.set_aspect("auto")
 
     cam_width = args.cam_width
     cam_height = args.cam_height
diff --git a/samples/python/gaussian_mix.py b/samples/python/gaussian_mix.py
index 5f2dfcc44093..4c1f86794cd6 100755
--- a/samples/python/gaussian_mix.py
+++ b/samples/python/gaussian_mix.py
@@ -28,11 +28,11 @@ def make_gaussians(cluster_n, img_size):
     return points, ref_distrs
 
 def draw_gaussain(img, mean, cov, color):
-    x, y = np.int32(mean)
+    x, y = mean
     w, u, _vt = cv.SVDecomp(cov)
     ang = np.arctan2(u[1, 0], u[0, 0])*(180/np.pi)
     s1, s2 = np.sqrt(w)*3.0
-    cv.ellipse(img, (x, y), (s1, s2), ang, 0, 360, color, 1, cv.LINE_AA)
+    cv.ellipse(img, (int(x), int(y)), (int(s1), int(s2)), ang, 0, 360, color, 1, cv.LINE_AA)
 
 
 def main():
diff --git a/samples/python/hist.py b/samples/python/hist.py
index 4c2c1ad395ef..8c1f4546a817 100755
--- a/samples/python/hist.py
+++ b/samples/python/hist.py
@@ -46,9 +46,9 @@ def hist_lines(im):
         im = cv.cvtColor(im,cv.COLOR_BGR2GRAY)
     hist_item = cv.calcHist([im],[0],None,[256],[0,256])
     cv.normalize(hist_item,hist_item,0,255,cv.NORM_MINMAX)
-    hist=np.int32(np.around(hist_item))
+    hist = np.int32(np.around(hist_item))
     for x,y in enumerate(hist):
-        cv.line(h,(x,0),(x,y),(255,255,255))
+        cv.line(h,(x,0),(x,y[0]),(255,255,255))
     y = np.flipud(h)
     return y
 
diff --git a/samples/python/kalman.py b/samples/python/kalman.py
index 654e3de3da0d..cf152a8700fd 100755
--- a/samples/python/kalman.py
+++ b/samples/python/kalman.py
@@ -1,14 +1,18 @@
 #!/usr/bin/env python
 """
    Tracking of rotating point.
-   Rotation speed is constant.
+   Point moves in a circle and is characterized by a 1D state.
+   state_k+1 = state_k + speed + process_noise N(0, 1e-5)
+   The speed is constant.
    Both state and measurements vectors are 1D (a point angle),
-   Measurement is the real point angle + gaussian noise.
-   The real and the estimated points are connected with yellow line segment,
-   the real and the measured points are connected with red line segment.
+   Measurement is the real state + gaussian noise N(0, 1e-1).
+   The real and the measured points are connected with red line segment,
+   the real and the estimated points are connected with yellow line segment,
+   the real and the corrected estimated points are connected with green line segment.
    (if Kalman filter works correctly,
-    the yellow segment should be shorter than the red one).
-   Pressing any key (except ESC) will reset the tracking with a different speed.
+    the yellow segment should be shorter than the red one and
+    the green segment should be shorter than the yellow one).
+   Pressing any key (except ESC) will reset the tracking.
    Pressing ESC will stop the program.
 """
 # Python 2/3 compatibility
@@ -21,8 +25,7 @@
 import numpy as np
 import cv2 as cv
 
-from math import cos, sin, sqrt
-import numpy as np
+from math import cos, sin, sqrt, pi
 
 def main():
     img_height = 500
@@ -30,64 +33,62 @@ def main():
     kalman = cv.KalmanFilter(2, 1, 0)
 
     code = long(-1)
-
-    cv.namedWindow("Kalman")
-
+    num_circle_steps = 12
     while True:
-        state = 0.1 * np.random.randn(2, 1)
-
-        kalman.transitionMatrix = np.array([[1., 1.], [0., 1.]])
-        kalman.measurementMatrix = 1. * np.ones((1, 2))
-        kalman.processNoiseCov = 1e-5 * np.eye(2)
-        kalman.measurementNoiseCov = 1e-1 * np.ones((1, 1))
-        kalman.errorCovPost = 1. * np.ones((2, 2))
-        kalman.statePost = 0.1 * np.random.randn(2, 1)
+        img = np.zeros((img_height, img_width, 3), np.uint8)
+        state = np.array([[0.0],[(2 * pi) / num_circle_steps]])   # start state
+        kalman.transitionMatrix = np.array([[1., 1.], [0., 1.]])  # F. input
+        kalman.measurementMatrix = 1. * np.eye(1, 2)              # H. input
+        kalman.processNoiseCov = 1e-5 * np.eye(2)                 # Q. input
+        kalman.measurementNoiseCov = 1e-1 * np.ones((1, 1))       # R. input
+        kalman.errorCovPost = 1. * np.eye(2, 2)                   # P._k|k  KF state var
+        kalman.statePost = 0.1 * np.random.randn(2, 1)            # x^_k|k  KF state var
 
         while True:
             def calc_point(angle):
-                return (np.around(img_width/2 + img_width/3*cos(angle), 0).astype(int),
-                        np.around(img_height/2 - img_width/3*sin(angle), 1).astype(int))
-
+                return (np.around(img_width / 2. + img_width / 3.0 * cos(angle), 0).astype(int),
+                        np.around(img_height / 2. - img_width / 3.0 * sin(angle), 1).astype(int))
+            img = img * 1e-3
             state_angle = state[0, 0]
             state_pt = calc_point(state_angle)
-
+            # advance Kalman filter to next timestep
+            # updates statePre, statePost, errorCovPre, errorCovPost
+            # k-> k+1, x'(k) = A*x(k)
+            # P'(k) = temp1*At + Q
             prediction = kalman.predict()
-            predict_angle = prediction[0, 0]
-            predict_pt = calc_point(predict_angle)
-
-            measurement = kalman.measurementNoiseCov * np.random.randn(1, 1)
 
+            predict_pt = calc_point(prediction[0, 0])  # equivalent to calc_point(kalman.statePre[0,0])
             # generate measurement
+            measurement = kalman.measurementNoiseCov * np.random.randn(1, 1)
             measurement = np.dot(kalman.measurementMatrix, state) + measurement
 
             measurement_angle = measurement[0, 0]
             measurement_pt = calc_point(measurement_angle)
 
-            # plot points
-            def draw_cross(center, color, d):
-                cv.line(img,
-                         (center[0] - d, center[1] - d), (center[0] + d, center[1] + d),
-                         color, 1, cv.LINE_AA, 0)
-                cv.line(img,
-                         (center[0] + d, center[1] - d), (center[0] - d, center[1] + d),
-                         color, 1, cv.LINE_AA, 0)
-
-            img = np.zeros((img_height, img_width, 3), np.uint8)
-            draw_cross(np.int32(state_pt), (255, 255, 255), 3)
-            draw_cross(np.int32(measurement_pt), (0, 0, 255), 3)
-            draw_cross(np.int32(predict_pt), (0, 255, 0), 3)
-
-            cv.line(img, state_pt, measurement_pt, (0, 0, 255), 3, cv.LINE_AA, 0)
-            cv.line(img, state_pt, predict_pt, (0, 255, 255), 3, cv.LINE_AA, 0)
-
+            # correct the state estimates based on measurements
+            # updates statePost & errorCovPost
             kalman.correct(measurement)
+            improved_pt = calc_point(kalman.statePost[0, 0])
 
-            process_noise = sqrt(kalman.processNoiseCov[0,0]) * np.random.randn(2, 1)
-            state = np.dot(kalman.transitionMatrix, state) + process_noise
+            # plot points
+            cv.drawMarker(img, measurement_pt, (0, 0, 255), cv.MARKER_SQUARE, 5, 2)
+            cv.drawMarker(img, predict_pt, (0, 255, 255), cv.MARKER_SQUARE, 5, 2)
+            cv.drawMarker(img, improved_pt, (0, 255, 0), cv.MARKER_SQUARE, 5, 2)
+            cv.drawMarker(img, state_pt, (255, 255, 255), cv.MARKER_STAR, 10, 1)
+            # forecast one step
+            cv.drawMarker(img, calc_point(np.dot(kalman.transitionMatrix, kalman.statePost)[0, 0]),
+                          (255, 255, 0), cv.MARKER_SQUARE, 12, 1)
+
+            cv.line(img, state_pt, measurement_pt, (0, 0, 255), 1, cv.LINE_AA, 0)  # red measurement error
+            cv.line(img, state_pt, predict_pt, (0, 255, 255), 1, cv.LINE_AA, 0)  # yellow pre-meas error
+            cv.line(img, state_pt, improved_pt, (0, 255, 0), 1, cv.LINE_AA, 0)  # green post-meas error
+
+            # update the real process
+            process_noise = sqrt(kalman.processNoiseCov[0, 0]) * np.random.randn(2, 1)
+            state = np.dot(kalman.transitionMatrix, state) + process_noise  # x_k+1 = F x_k + w_k
 
             cv.imshow("Kalman", img)
-
-            code = cv.waitKey(100)
+            code = cv.waitKey(1000)
             if code != -1:
                 break
 
diff --git a/samples/python/lk_homography.py b/samples/python/lk_homography.py
index 808f30965f0d..38a05f63b6a5 100755
--- a/samples/python/lk_homography.py
+++ b/samples/python/lk_homography.py
@@ -77,8 +77,8 @@ def run(self):
 
                 for (x0, y0), (x1, y1), good in zip(self.p0[:,0], self.p1[:,0], status[:,0]):
                     if good:
-                        cv.line(vis, (x0, y0), (x1, y1), (0, 128, 0))
-                    cv.circle(vis, (x1, y1), 2, (red, green)[good], -1)
+                        cv.line(vis, (int(x0), int(y0)), (int(x1), int(y1)), (0, 128, 0))
+                    cv.circle(vis, (int(x1), int(y1)), 2, (red, green)[good], -1)
                 draw_str(vis, (20, 20), 'track count: %d' % len(self.p1))
                 if self.use_ransac:
                     draw_str(vis, (20, 40), 'RANSAC')
@@ -86,7 +86,7 @@ def run(self):
                 p = cv.goodFeaturesToTrack(frame_gray, **feature_params)
                 if p is not None:
                     for x, y in p[:,0]:
-                        cv.circle(vis, (x, y), 2, green, -1)
+                        cv.circle(vis, (int(x), int(y)), 2, green, -1)
                     draw_str(vis, (20, 20), 'feature count: %d' % len(p))
 
             cv.imshow('lk_homography', vis)
diff --git a/samples/python/lk_track.py b/samples/python/lk_track.py
index 7b77f1b33595..97a8c40241e2 100755
--- a/samples/python/lk_track.py
+++ b/samples/python/lk_track.py
@@ -65,7 +65,7 @@ def run(self):
                     if len(tr) > self.track_len:
                         del tr[0]
                     new_tracks.append(tr)
-                    cv.circle(vis, (x, y), 2, (0, 255, 0), -1)
+                    cv.circle(vis, (int(x), int(y)), 2, (0, 255, 0), -1)
                 self.tracks = new_tracks
                 cv.polylines(vis, [np.int32(tr) for tr in self.tracks], False, (0, 255, 0))
                 draw_str(vis, (20, 20), 'track count: %d' % len(self.tracks))
diff --git a/samples/python/morphology.py b/samples/python/morphology.py
index 9ecf5b0682e7..183f5e828815 100755
--- a/samples/python/morphology.py
+++ b/samples/python/morphology.py
@@ -50,8 +50,11 @@ def main():
         cur_str_mode = str_modes.next()
 
     def update(dummy=None):
-        sz = cv.getTrackbarPos('op/size', 'morphology')
-        iters = cv.getTrackbarPos('iters', 'morphology')
+        try: # do not get trackbar position while trackbar is not created
+            sz = cv.getTrackbarPos('op/size', 'morphology')
+            iters = cv.getTrackbarPos('iters', 'morphology')
+        except:
+            return
         opers = cur_mode.split('/')
         if len(opers) > 1:
             sz = sz - 10
diff --git a/samples/python/stitching_detailed.py b/samples/python/stitching_detailed.py
index a7e316105edd..4ee29048d118 100644
--- a/samples/python/stitching_detailed.py
+++ b/samples/python/stitching_detailed.py
@@ -450,7 +450,8 @@ def main():
                 cameras[i].focal *= compose_work_aspect
                 cameras[i].ppx *= compose_work_aspect
                 cameras[i].ppy *= compose_work_aspect
-                sz = (full_img_sizes[i][0] * compose_scale, full_img_sizes[i][1] * compose_scale)
+                sz = (int(round(full_img_sizes[i][0] * compose_scale)),
+                      int(round(full_img_sizes[i][1] * compose_scale)))
                 K = cameras[i].K().astype(np.float32)
                 roi = warper.warpRoi(sz, K, cameras[i].R)
                 corners.append(roi[0:2])
diff --git a/samples/python/video_v4l2.py b/samples/python/video_v4l2.py
index 61b1e3580483..abebb2a2cacc 100644
--- a/samples/python/video_v4l2.py
+++ b/samples/python/video_v4l2.py
@@ -30,7 +30,7 @@ def decode_fourcc(v):
     color = (0, 255, 0)
 
     cap = cv.VideoCapture(0)
-    cap.set(cv.CAP_PROP_AUTOFOCUS, False)  # Known bug: https://github.com/opencv/opencv/pull/5474
+    cap.set(cv.CAP_PROP_AUTOFOCUS, 0)  # Known bug: https://github.com/opencv/opencv/pull/5474
 
     cv.namedWindow("Video")
 
@@ -67,7 +67,7 @@ def decode_fourcc(v):
             break
         elif k == ord('g'):
             convert_rgb = not convert_rgb
-            cap.set(cv.CAP_PROP_CONVERT_RGB, convert_rgb)
+            cap.set(cv.CAP_PROP_CONVERT_RGB, 1 if convert_rgb else 0)
 
     print('Done')
 
diff --git a/samples/semihosting/CMakeLists.txt b/samples/semihosting/CMakeLists.txt
new file mode 100644
index 000000000000..9fddb0587b43
--- /dev/null
+++ b/samples/semihosting/CMakeLists.txt
@@ -0,0 +1,10 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html
+
+set(SEMIHOSTING_SUFFIX semihosting)
+
+add_subdirectory(include)
+set(RAW_PIXEL_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
+add_subdirectory(histogram)
+add_subdirectory(norm)
diff --git a/samples/semihosting/README.md b/samples/semihosting/README.md
new file mode 100644
index 000000000000..881b09b735b8
--- /dev/null
+++ b/samples/semihosting/README.md
@@ -0,0 +1,27 @@
+# Arm semihosting
+
+This folder contain a toolchain file and a couple of examples for
+building OpenCV based applications that can run in an [Arm
+semihosting](https://developer.arm.com/documentation/100863/latest)
+setup.
+
+OpenCV can be compiled to target a semihosting platform as follows:
+
+```
+cmake ../opencv/ \
+    -DCMAKE_TOOLCHAIN_FILE=../opencv/platforms/semihosting/aarch64-semihosting.toolchain.cmake \
+    -DSEMIHOSTING_TOOLCHAIN_PATH=/path/to/baremetal-toolchain/bin/ \
+    -DBUILD_EXAMPLES=ON -GNinja
+```
+
+A barematel toolchain for targeting aarch64 semihosting can be found
+[here](https://developer.arm.com/tools-and-software/open-source-software/developer-tools/gnu-toolchain/gnu-a/downloads),
+under `aarch64-none-elf`.
+
+The code of the examples in the `norm` and `histogram` folders can be
+executed with qemu in Linux userspace:
+
+```
+    qemu-aarch64 ./bin/example_semihosting_histogram
+    qemu-aarch64 ./bin/example_semihosting_norm
+```
diff --git a/samples/semihosting/histogram/CMakeLists.txt b/samples/semihosting/histogram/CMakeLists.txt
new file mode 100644
index 000000000000..d2f065d1b9c8
--- /dev/null
+++ b/samples/semihosting/histogram/CMakeLists.txt
@@ -0,0 +1,26 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html
+
+set(PROJECT_NAME histogram)
+project(${PROJECT_NAME})
+
+ocv_install_example_src(histogram *.cpp *.hpp CMakeLists.txt)
+
+set(LOCAL_DEPS
+  opencv_core
+  opencv_imgproc
+  ${OPENCV_MODULES_PUBLIC}
+  ${OpenCV_LIB_COMPONENTS})
+ocv_check_dependencies(${LOCAL_DEPS})
+
+if(NOT OCV_DEPENDENCIES_FOUND)
+  return()
+endif()
+
+ocv_define_sample(histogram histogram.cpp ${SEMIHOSTING_SUFFIX})
+ocv_include_modules_recurse(${LOCAL_DEPS})
+target_include_directories(${histogram} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(${histogram} PRIVATE ${RAW_PIXEL_INCLUDE})
+ocv_target_link_libraries(${histogram} PRIVATE ${OPENCV_LINKER_LIBS}
+  ${LOCAL_DEPS})
diff --git a/samples/semihosting/histogram/histogram.cpp b/samples/semihosting/histogram/histogram.cpp
new file mode 100644
index 000000000000..daa568d0bbb0
--- /dev/null
+++ b/samples/semihosting/histogram/histogram.cpp
@@ -0,0 +1,43 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include <opencv2/imgproc.hpp>
+#include <opencv2/imgcodecs.hpp>
+
+#include <cstdint>
+#include <array>
+#include <iostream>
+#include "raw_pixels.hpp"
+
+#define IMG_ROWS 100
+#define IMG_COLS 100
+
+static_assert(IMG_ROWS * IMG_COLS <= RAW_PIXELS_SIZE, "Incompatible size");
+
+int main(void)
+{
+    // Number of experiment runs
+    int no_runs = 2;
+
+    // https://docs.opencv.org/master/d3/d63/classcv_1_1Mat.html
+    cv::Mat src_new(IMG_ROWS, IMG_COLS, CV_8UC1, (void *)raw_pixels);
+
+    // Set parameters
+    int imgCount = 1;
+    const int channels[] = {0};
+    cv::Mat mask = cv::Mat();
+    cv::Mat hist;
+    int dims = 1;
+    const int hist_sizes[] = {256};
+    float Range[] = {0,256};
+    const float *ranges[] = {Range};
+
+    // Run calc Hist
+    for(int i=0; i < no_runs; i++){
+        std::cout << "Running iteration # "<< i << std::endl;
+        cv::calcHist(&src_new, imgCount, channels, mask, hist, dims, hist_sizes, ranges);
+    }
+
+    return 0;
+}
diff --git a/samples/semihosting/include/CMakeLists.txt b/samples/semihosting/include/CMakeLists.txt
new file mode 100644
index 000000000000..3c429b8adf6c
--- /dev/null
+++ b/samples/semihosting/include/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Populate a C array with random data.
+set(RAW_PIXELS_SIZE 102400)
+set(RAW_PIXELS_HEADER ${CMAKE_CURRENT_BINARY_DIR}/raw_pixels.hpp)
+set(RAW_PIXELS_HEADER_IN ${CMAKE_CURRENT_SOURCE_DIR}/raw_pixels.hpp.in)
+
+set(RAW_PIXEL_VALUES "")
+# Seed the random number generator.
+string(RANDOM LENGTH 8 ALPHABET 0123456789abcdf RANDOM_SEED 314 number)
+math(EXPR LOOP_RANGE "${RAW_PIXELS_SIZE} - 1")
+
+foreach(i RANGE ${LOOP_RANGE})
+  string(RANDOM LENGTH 8 ALPHABET 0123456789abcdf number)
+  string(CONCAT RAW_PIXEL_VALUES ${RAW_PIXEL_VALUES} "0x${number}, \\\n")
+endforeach()
+
+configure_file(${RAW_PIXELS_HEADER_IN} ${RAW_PIXELS_HEADER})
diff --git a/samples/semihosting/include/raw_pixels.hpp.in b/samples/semihosting/include/raw_pixels.hpp.in
new file mode 100644
index 000000000000..6ee98222cc1b
--- /dev/null
+++ b/samples/semihosting/include/raw_pixels.hpp.in
@@ -0,0 +1,11 @@
+#ifndef RAW_PIXELS_HPP
+#define RAW_PIXELS_HP
+#include <cstdint>
+
+#cmakedefine RAW_PIXEL_VALUES @RAW_PIXEL_VALUES@
+#cmakedefine RAW_PIXELS_SIZE @RAW_PIXELS_SIZE@
+
+static std::uint32_t raw_pixels[RAW_PIXELS_SIZE] = {
+    RAW_PIXEL_VALUES
+};
+#endif //RAW_PIXELS_HPP
diff --git a/samples/semihosting/norm/CMakeLists.txt b/samples/semihosting/norm/CMakeLists.txt
new file mode 100644
index 000000000000..6f23d74627d2
--- /dev/null
+++ b/samples/semihosting/norm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html
+
+set(PROJECT_NAME norm)
+project(${PROJECT_NAME})
+
+ocv_install_example_src(norm *.cpp *.hpp CMakeLists.txt)
+
+set(LOCAL_DEPS
+  opencv_core
+  ${OPENCV_MODULES_PUBLIC}
+  ${OpenCV_LIB_COMPONENTS})
+ocv_check_dependencies(${LOCAL_DEPS})
+
+if(NOT OCV_DEPENDENCIES_FOUND)
+  return()
+endif()
+
+ocv_define_sample(norm norm.cpp ${SEMIHOSTING_SUFFIX})
+ocv_include_modules_recurse(${LOCAL_DEPS})
+target_include_directories(${norm} PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(${norm} PRIVATE ${RAW_PIXEL_INCLUDE})
+ocv_target_link_libraries(${norm} PRIVATE ${OPENCV_LINKER_LIBS}
+  ${LOCAL_DEPS})
diff --git a/samples/semihosting/norm/norm.cpp b/samples/semihosting/norm/norm.cpp
new file mode 100644
index 000000000000..f911754be132
--- /dev/null
+++ b/samples/semihosting/norm/norm.cpp
@@ -0,0 +1,33 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+
+#include <cstdint>
+#include <array>
+#include <iostream>
+#include "raw_pixels.hpp"
+
+#define IMG_ROWS 100
+#define IMG_COLS 100
+
+static_assert(IMG_ROWS * IMG_COLS <= RAW_PIXELS_SIZE, "Incompatible size");
+
+int main(void)
+{
+    // Number of experiment runs
+    int no_runs = 2;
+
+    // https://docs.opencv.org/master/d3/d63/classcv_1_1Mat.html
+    cv::Mat src(IMG_ROWS, IMG_COLS, CV_8UC1, (void *)raw_pixels);
+
+    // Run calc Hist
+    for(int i=0; i < no_runs; i++){
+        std::cout << "Running iteration # "<< i << std::endl;
+        cv::norm(src);
+    }
+
+    return 0;
+}