diff --git a/3rdParty/opencv2/README.md b/3rdParty/opencv2/README.md deleted file mode 100644 index 6f908e05a1..0000000000 --- a/3rdParty/opencv2/README.md +++ /dev/null @@ -1 +0,0 @@ -These includes are taken from [OpenCV 4.6.0](https://github.com/opencv/opencv). diff --git a/3rdParty/opencv2/calib3d.hpp b/3rdParty/opencv2/calib3d.hpp index 4c125d7345..a0245b81b1 100644 --- a/3rdParty/opencv2/calib3d.hpp +++ b/3rdParty/opencv2/calib3d.hpp @@ -48,6 +48,7 @@ #include "opencv2/core/types.hpp" #include "opencv2/features2d.hpp" #include "opencv2/core/affine.hpp" +#include "opencv2/core/utils/logger.hpp" /** @defgroup calib3d Camera Calibration and 3D Reconstruction @@ -411,11 +412,11 @@ R & t \\ where R is the rotation matrix corresponding to the rotation vector om: R = rodrigues(om); call x, y and z the 3 coordinates of Xc: - \f[x = Xc_1 \\ y = Xc_2 \\ z = Xc_3\f] + \f[\begin{array}{l} x = Xc_1 \\ y = Xc_2 \\ z = Xc_3 \end{array} \f] The pinhole projection coordinates of P is [a; b] where - \f[a = x / z \ and \ b = y / z \\ r^2 = a^2 + b^2 \\ \theta = atan(r)\f] + \f[\begin{array}{l} a = x / z \ and \ b = y / z \\ r^2 = a^2 + b^2 \\ \theta = atan(r) \end{array} \f] Fisheye distortion: @@ -423,18 +424,16 @@ R & t \\ The distorted point coordinates are [x'; y'] where - \f[x' = (\theta_d / r) a \\ y' = (\theta_d / r) b \f] + \f[\begin{array}{l} x' = (\theta_d / r) a \\ y' = (\theta_d / r) b \end{array} \f] Finally, conversion into pixel coordinates: The final pixel coordinates vector [u; v] where: - \f[u = f_x (x' + \alpha y') + c_x \\ - v = f_y y' + c_y\f] + \f[\begin{array}{l} u = f_x (x' + \alpha y') + c_x \\ + v = f_y y' + c_y \end{array} \f] Summary: Generic camera model @cite Kannala2006 with perspective projection and without distortion correction - @defgroup calib3d_c C API - @} */ @@ -490,7 +489,8 @@ enum { CALIB_CB_ADAPTIVE_THRESH = 1, CALIB_CB_EXHAUSTIVE = 16, CALIB_CB_ACCURACY = 32, CALIB_CB_LARGER = 64, - CALIB_CB_MARKER = 128 + CALIB_CB_MARKER = 128, + CALIB_CB_PLAIN = 256 }; enum { CALIB_CB_SYMMETRIC_GRID = 1, @@ -548,12 +548,13 @@ enum RobotWorldHandEyeCalibrationMethod CALIB_ROBOT_WORLD_HAND_EYE_LI = 1 //!< Simultaneous robot-world and hand-eye calibration using dual-quaternions and kronecker product @cite Li2010SimultaneousRA }; -enum SamplingMethod { SAMPLING_UNIFORM, SAMPLING_PROGRESSIVE_NAPSAC, SAMPLING_NAPSAC, - SAMPLING_PROSAC }; -enum LocalOptimMethod {LOCAL_OPTIM_NULL, LOCAL_OPTIM_INNER_LO, LOCAL_OPTIM_INNER_AND_ITER_LO, - LOCAL_OPTIM_GC, LOCAL_OPTIM_SIGMA}; -enum ScoreMethod {SCORE_METHOD_RANSAC, SCORE_METHOD_MSAC, SCORE_METHOD_MAGSAC, SCORE_METHOD_LMEDS}; -enum NeighborSearchMethod { NEIGH_FLANN_KNN, NEIGH_GRID, NEIGH_FLANN_RADIUS }; +enum SamplingMethod { SAMPLING_UNIFORM=0, SAMPLING_PROGRESSIVE_NAPSAC=1, SAMPLING_NAPSAC=2, + SAMPLING_PROSAC=3 }; +enum LocalOptimMethod {LOCAL_OPTIM_NULL=0, LOCAL_OPTIM_INNER_LO=1, LOCAL_OPTIM_INNER_AND_ITER_LO=2, + LOCAL_OPTIM_GC=3, LOCAL_OPTIM_SIGMA=4}; +enum ScoreMethod {SCORE_METHOD_RANSAC=0, SCORE_METHOD_MSAC=1, SCORE_METHOD_MAGSAC=2, SCORE_METHOD_LMEDS=3}; +enum NeighborSearchMethod { NEIGH_FLANN_KNN=0, NEIGH_GRID=1, NEIGH_FLANN_RADIUS=2 }; +enum PolishingMethod { NONE_POLISHER=0, LSQ_POLISHER=1, MAGSAC=2, COV_POLISHER=3 }; struct CV_EXPORTS_W_SIMPLE UsacParams { // in alphabetical order @@ -569,6 +570,8 @@ struct CV_EXPORTS_W_SIMPLE UsacParams CV_PROP_RW SamplingMethod sampler; CV_PROP_RW ScoreMethod score; CV_PROP_RW double threshold; + CV_PROP_RW PolishingMethod final_polisher; + CV_PROP_RW int final_polisher_iterations; }; /** 
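For reference, a minimal sketch of how the new `UsacParams` fields above (`final_polisher`, `final_polisher_iterations`) could be exercised through the USAC overload of `cv::findHomography`. The synthetic correspondences and the particular enum choices are purely illustrative, not a recommended configuration.

```cpp
#include <opencv2/calib3d.hpp>
#include <opencv2/core.hpp>
#include <vector>

int main()
{
    // Synthetic correspondences: project a grid of points through a known homography.
    cv::Matx33d Hgt(1.05, 0.02, 4.0,
                   -0.01, 0.98, 2.5,
                    1e-4, 2e-4, 1.0);
    std::vector<cv::Point2f> srcPts, dstPts;
    for (int y = 0; y < 10; ++y)
        for (int x = 0; x < 10; ++x)
        {
            cv::Point2f p(20.f * x, 20.f * y);
            cv::Vec3d q = Hgt * cv::Vec3d(p.x, p.y, 1.0);
            srcPts.push_back(p);
            dstPts.push_back(cv::Point2f(float(q[0] / q[2]), float(q[1] / q[2])));
        }

    cv::UsacParams params;                        // defaults plus a few explicit choices
    params.sampler   = cv::SAMPLING_UNIFORM;
    params.score     = cv::SCORE_METHOD_MSAC;
    params.loMethod  = cv::LOCAL_OPTIM_INNER_LO;
    params.threshold = 1.5;                       // reprojection threshold in pixels
    // Fields added by this hunk: select and bound the final polishing step.
    params.final_polisher            = cv::COV_POLISHER;
    params.final_polisher_iterations = 10;

    cv::Mat inlierMask;
    cv::Mat H = cv::findHomography(srcPts, dstPts, inlierMask, params);
    return H.empty() ? 1 : 0;
}
```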
@brief Converts a rotation matrix to a rotation vector or vice versa. @@ -619,7 +622,7 @@ class CV_EXPORTS LMSolver : public Algorithm @param param the current vector of parameters @param err output vector of errors: err_i = actual_f_i - ideal_f_i - @param J output Jacobian: J_ij = d(err_i)/d(param_j) + @param J output Jacobian: J_ij = d(ideal_f_i)/d(param_j) when J=noArray(), it means that it does not need to be computed. Dimensionality of error vector and param vector can be different. @@ -685,7 +688,7 @@ a vector\ . - @ref RHO - PROSAC-based robust method @param ransacReprojThreshold Maximum allowed reprojection error to treat a point pair as an inlier (used in the RANSAC and RHO methods only). That is, if -\f[\| \texttt{dstPoints} _i - \texttt{convertPointsHomogeneous} ( \texttt{H} * \texttt{srcPoints} _i) \|_2 > \texttt{ransacReprojThreshold}\f] +\f[\| \texttt{dstPoints} _i - \texttt{convertPointsHomogeneous} ( \texttt{H} \cdot \texttt{srcPoints} _i) \|_2 > \texttt{ransacReprojThreshold}\f] then the point \f$i\f$ is considered as an outlier. If srcPoints and dstPoints are measured in pixels, it usually makes sense to set this parameter somewhere in the range of 1 to 10. @param mask Optional output mask set by a robust method ( RANSAC or LMeDS ). Note that the input @@ -724,8 +727,8 @@ correctly only when there are more than 50% of inliers. Finally, if there are no noise is rather small, use the default method (method=0). The function is used to find initial intrinsic and extrinsic matrices. Homography matrix is -determined up to a scale. Thus, it is normalized so that \f$h_{33}=1\f$. Note that whenever an \f$H\f$ matrix -cannot be estimated, an empty one will be returned. +determined up to a scale. If \f$h_{33}\f$ is non-zero, the matrix is normalized so that \f$h_{33}=1\f$. +@note Whenever an \f$H\f$ matrix cannot be estimated, an empty one will be returned. @sa getAffineTransform, estimateAffine2D, estimateAffinePartial2D, getPerspectiveTransform, warpPerspective, @@ -760,7 +763,7 @@ and a rotation matrix. It optionally returns three rotation matrices, one for each axis, and the three Euler angles in degrees (as the return value) that could be used in OpenGL. Note, there is always more than one sequence of rotations about the three principal axes that results in the same orientation of an -object, e.g. see @cite Slabaugh . Returned tree rotation matrices and corresponding three Euler angles +object, e.g. see @cite Slabaugh . Returned three rotation matrices and corresponding three Euler angles are only one of the possible solutions. */ CV_EXPORTS_W Vec3d RQDecomp3x3( InputArray src, OutputArray mtxR, OutputArray mtxQ, @@ -786,9 +789,9 @@ matrix and the position of a camera. It optionally returns three rotation matrices, one for each axis, and three Euler angles that could be used in OpenGL. Note, there is always more than one sequence of rotations about the three principal axes that results in the same orientation of an object, e.g. see @cite Slabaugh . Returned -tree rotation matrices and corresponding three Euler angles are only one of the possible solutions. +three rotation matrices and corresponding three Euler angles are only one of the possible solutions. -The function is based on RQDecomp3x3 . +The function is based on #RQDecomp3x3 . 
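The reworded note above (an empty matrix is returned whenever H cannot be estimated) suggests callers should guard the result. A small hedged sketch; the wrapper name is hypothetical and the point vectors are assumed to come from feature matching:

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

// Illustrates the note above: findHomography returns an empty matrix when the
// homography cannot be estimated, so the result should be checked before use.
cv::Mat estimatePlaneHomography(const std::vector<cv::Point2f>& srcPts,
                                const std::vector<cv::Point2f>& dstPts)
{
    cv::Mat inlierMask;
    cv::Mat H = cv::findHomography(srcPts, dstPts, cv::RANSAC, 3.0, inlierMask);
    if (H.empty())
        return cv::Mat();   // estimation failed; the caller must handle the empty result
    return H;               // otherwise H is normalized so that h33 = 1 (when h33 != 0)
}
```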
*/ CV_EXPORTS_W void decomposeProjectionMatrix( InputArray projMatrix, OutputArray cameraMatrix, OutputArray rotMatrix, OutputArray transVect, @@ -834,10 +837,10 @@ The functions compute: \f[\begin{array}{l} \texttt{rvec3} = \mathrm{rodrigues} ^{-1} \left ( \mathrm{rodrigues} ( \texttt{rvec2} ) \cdot \mathrm{rodrigues} ( \texttt{rvec1} ) \right ) \\ \texttt{tvec3} = \mathrm{rodrigues} ( \texttt{rvec2} ) \cdot \texttt{tvec1} + \texttt{tvec2} \end{array} ,\f] where \f$\mathrm{rodrigues}\f$ denotes a rotation vector to a rotation matrix transformation, and -\f$\mathrm{rodrigues}^{-1}\f$ denotes the inverse transformation. See Rodrigues for details. +\f$\mathrm{rodrigues}^{-1}\f$ denotes the inverse transformation. See #Rodrigues for details. Also, the functions can compute the derivatives of the output vectors with regards to the input -vectors (see matMulDeriv ). The functions are used inside #stereoCalibrate but can also be used in +vectors (see #matMulDeriv ). The functions are used inside #stereoCalibrate but can also be used in your own code where Levenberg-Marquardt or another gradient-based solver is used to optimize a function that contains a matrix multiplication. */ @@ -1206,7 +1209,7 @@ coordinate space. In the old interface all the per-view vectors are concatenated old interface all the per-view vectors are concatenated. @param imageSize Image size in pixels used to initialize the principal point. @param aspectRatio If it is zero or negative, both \f$f_x\f$ and \f$f_y\f$ are estimated independently. -Otherwise, \f$f_x = f_y * \texttt{aspectRatio}\f$ . +Otherwise, \f$f_x = f_y \cdot \texttt{aspectRatio}\f$ . The function estimates and returns an initial camera intrinsic matrix for the camera calibration process. Currently, the function only supports planar calibration patterns, which are patterns where each @@ -1225,13 +1228,17 @@ CV_EXPORTS_W Mat initCameraMatrix2D( InputArrayOfArrays objectPoints, @param flags Various operation flags that can be zero or a combination of the following values: - @ref CALIB_CB_ADAPTIVE_THRESH Use adaptive thresholding to convert the image to black and white, rather than a fixed threshold level (computed from the average image brightness). -- @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with equalizeHist before +- @ref CALIB_CB_NORMALIZE_IMAGE Normalize the image gamma with #equalizeHist before applying fixed or adaptive thresholding. - @ref CALIB_CB_FILTER_QUADS Use additional criteria (like contour area, perimeter, square-like shape) to filter out false quads extracted at the contour retrieval stage. - @ref CALIB_CB_FAST_CHECK Run a fast check on the image that looks for chessboard corners, and shortcut the call if none is found. This can drastically speed up the call in the degenerate condition when no chessboard is observed. +- @ref CALIB_CB_PLAIN All other flags are ignored. The input image is taken as is. +No image processing is done to improve to find the checkerboard. This has the effect of speeding up the +execution of the function but could lead to not recognizing the checkerboard if the image +is not previously binarized in the appropriate manner. The function attempts to determine whether the input image is a view of the chessboard pattern and locate the internal chessboard corners. 
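A possible usage sketch of the new CALIB_CB_PLAIN flag documented above, assuming an 8-bit grayscale input that is binarized up front; the helper name and the Otsu threshold choice are illustrative only:

```cpp
#include <opencv2/calib3d.hpp>
#include <opencv2/imgproc.hpp>
#include <vector>

// With CALIB_CB_PLAIN the image is taken as is, so it should already be binarized.
bool findCornersPlain(const cv::Mat& gray, cv::Size patternSize,
                      std::vector<cv::Point2f>& corners)
{
    cv::Mat binarized;
    cv::threshold(gray, binarized, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);
    bool found = cv::findChessboardCorners(binarized, patternSize, corners,
                                           cv::CALIB_CB_PLAIN);
    if (found)  // refine to sub-pixel accuracy, as the documentation recommends
        cv::cornerSubPix(gray, corners, cv::Size(11, 11), cv::Size(-1, -1),
                         cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT,
                                          30, 0.01));
    return found;
}
```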
The function returns a non-zero value if all of the corners @@ -1239,7 +1246,7 @@ are found and they are placed in a certain order (row by row, left to right in e Otherwise, if the function fails to find all the corners or reorder them, it returns 0. For example, a regular chessboard has 8 x 8 squares and 7 x 7 internal corners, that is, points where the black squares touch each other. The detected coordinates are approximate, and to determine their positions -more accurately, the function calls cornerSubPix. You also may use the function cornerSubPix with +more accurately, the function calls #cornerSubPix. You also may use the function #cornerSubPix with different parameters if returned coordinates are not accurate enough. Sample usage of detecting and drawing chessboard corners: : @@ -1592,6 +1599,10 @@ The algorithm performs the following steps: \f$f_y\f$ (ratios of 10:1 or more)), then you are probably using patternSize=cvSize(rows,cols) instead of using patternSize=cvSize(cols,rows) in @ref findChessboardCorners. +@note + The function may throw exceptions, if unsupported combination of parameters is provided or + the system is underconstrained. + @sa calibrateCameraRO, findChessboardCorners, solvePnP, initCameraMatrix2D, stereoCalibrate, undistort @@ -1748,6 +1759,15 @@ second camera coordinate system. @param T Output translation vector, see description above. @param E Output essential matrix. @param F Output fundamental matrix. +@param rvecs Output vector of rotation vectors ( @ref Rodrigues ) estimated for each pattern view in the +coordinate system of the first camera of the stereo pair (e.g. std::vector). More in detail, each +i-th rotation vector together with the corresponding i-th translation vector (see the next output parameter +description) brings the calibration pattern from the object coordinate space (in which object points are +specified) to the camera coordinate space of the first camera of the stereo pair. In more technical terms, +the tuple of the i-th rotation and translation vector performs a change of basis from object coordinate space +to camera coordinate space of the first camera of the stereo pair. +@param tvecs Output vector of translation vectors estimated for each pattern view, see parameter description +of previous output parameter ( rvecs ). @param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view. @param flags Different flags that may be zero or a combination of the following values: - @ref CALIB_FIX_INTRINSIC Fix cameraMatrix? and distCoeffs? 
so that only R, T, E, and F @@ -1844,8 +1864,8 @@ CV_EXPORTS_AS(stereoCalibrateExtended) double stereoCalibrate( InputArrayOfArray InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1, InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2, - Size imageSize, InputOutputArray R,InputOutputArray T, OutputArray E, OutputArray F, - OutputArray perViewErrors, int flags = CALIB_FIX_INTRINSIC, + Size imageSize, InputOutputArray R, InputOutputArray T, OutputArray E, OutputArray F, + OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, OutputArray perViewErrors, int flags = CALIB_FIX_INTRINSIC, TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) ); /// @overload @@ -1857,6 +1877,15 @@ CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints, int flags = CALIB_FIX_INTRINSIC, TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) ); +/// @overload +CV_EXPORTS_W double stereoCalibrate( InputArrayOfArrays objectPoints, + InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, + InputOutputArray cameraMatrix1, InputOutputArray distCoeffs1, + InputOutputArray cameraMatrix2, InputOutputArray distCoeffs2, + Size imageSize, InputOutputArray R, InputOutputArray T, OutputArray E, OutputArray F, + OutputArray perViewErrors, int flags = CALIB_FIX_INTRINSIC, + TermCriteria criteria = TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 30, 1e-6) ); + /** @brief Computes rectification transforms for each head of a calibrated stereo camera. @param cameraMatrix1 First camera intrinsic matrix. @@ -1924,11 +1953,18 @@ coordinates. The function distinguishes the following two cases: \end{bmatrix}\f] \f[\texttt{P2} = \begin{bmatrix} - f & 0 & cx_2 & T_x*f \\ + f & 0 & cx_2 & T_x \cdot f \\ 0 & f & cy & 0 \\ 0 & 0 & 1 & 0 \end{bmatrix} ,\f] + \f[\texttt{Q} = \begin{bmatrix} + 1 & 0 & 0 & -cx_1 \\ + 0 & 1 & 0 & -cy \\ + 0 & 0 & 0 & f \\ + 0 & 0 & -\frac{1}{T_x} & \frac{cx_1 - cx_2}{T_x} + \end{bmatrix} \f] + where \f$T_x\f$ is a horizontal shift between the cameras and \f$cx_1=cx_2\f$ if @ref CALIB_ZERO_DISPARITY is set. @@ -1944,10 +1980,17 @@ coordinates. The function distinguishes the following two cases: \f[\texttt{P2} = \begin{bmatrix} f & 0 & cx & 0 \\ - 0 & f & cy_2 & T_y*f \\ + 0 & f & cy_2 & T_y \cdot f \\ 0 & 0 & 1 & 0 \end{bmatrix},\f] + \f[\texttt{Q} = \begin{bmatrix} + 1 & 0 & 0 & -cx \\ + 0 & 1 & 0 & -cy_1 \\ + 0 & 0 & 0 & f \\ + 0 & 0 & -\frac{1}{T_y} & \frac{cy_1 - cy_2}{T_y} + \end{bmatrix} \f] + where \f$T_y\f$ is a vertical shift between the cameras and \f$cy_1=cy_2\f$ if @ref CALIB_ZERO_DISPARITY is set. @@ -1983,8 +2026,8 @@ CV_EXPORTS_W void stereoRectify( InputArray cameraMatrix1, InputArray distCoeffs @param H2 Output rectification homography matrix for the second image. @param threshold Optional threshold used to filter out the outliers. If the parameter is greater than zero, all the point pairs that do not comply with the epipolar geometry (that is, the points -for which \f$|\texttt{points2[i]}^T*\texttt{F}*\texttt{points1[i]}|>\texttt{threshold}\f$ ) are -rejected prior to computing the homographies. Otherwise, all the points are considered inliers. +for which \f$|\texttt{points2[i]}^T \cdot \texttt{F} \cdot \texttt{points1[i]}|>\texttt{threshold}\f$ ) +are rejected prior to computing the homographies. Otherwise, all the points are considered inliers. 
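A sketch of the extended stereoCalibrate overload introduced above, which additionally returns per-view rvecs/tvecs expressed in the first camera's coordinate system. The wrapper function is hypothetical; the point lists are assumed to come from calibration-pattern detection:

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

double calibrateStereoRig(const std::vector<std::vector<cv::Point3f>>& objectPoints,
                          const std::vector<std::vector<cv::Point2f>>& imagePoints1,
                          const std::vector<std::vector<cv::Point2f>>& imagePoints2,
                          cv::Mat& K1, cv::Mat& D1, cv::Mat& K2, cv::Mat& D2,
                          cv::Size imageSize)
{
    cv::Mat R, T, E, F, perViewErrors;
    std::vector<cv::Mat> rvecs, tvecs;   // one rotation/translation per pattern view
    double rms = cv::stereoCalibrate(objectPoints, imagePoints1, imagePoints2,
                                     K1, D1, K2, D2, imageSize,
                                     R, T, E, F, rvecs, tvecs, perViewErrors,
                                     cv::CALIB_FIX_INTRINSIC);
    return rms;   // overall RMS re-projection error
}
```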
The function computes the rectification transformations without knowing intrinsic parameters of the cameras and their relative position in the space, which explains the suffix "uncalibrated". Another @@ -2407,7 +2450,7 @@ the found fundamental matrix. Normally just one matrix is found. But in case of algorithm, the function may return up to 3 solutions ( \f$9 \times 3\f$ matrix that stores all 3 matrices sequentially). -The calculated fundamental matrix may be passed further to computeCorrespondEpilines that finds the +The calculated fundamental matrix may be passed further to #computeCorrespondEpilines that finds the epipolar lines corresponding to the specified points. It can also be passed to #stereoRectifyUncalibrated to compute the rectification transformation. : @code @@ -2450,13 +2493,13 @@ CV_EXPORTS_W Mat findFundamentalMat( InputArray points1, InputArray points2, @param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should be floating-point (single or double precision). -@param points2 Array of the second image points of the same size and format as points1 . +@param points2 Array of the second image points of the same size and format as points1. @param cameraMatrix Camera intrinsic matrix \f$\cameramatrix{A}\f$ . Note that this function assumes that points1 and points2 are feature points from cameras with the -same camera intrinsic matrix. If this assumption does not hold for your use case, use -#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points -to normalized image coordinates, which are valid for the identity camera intrinsic matrix. When -passing these coordinates, pass the identity matrix for this parameter. +same camera intrinsic matrix. If this assumption does not hold for your use case, use another +function overload or #undistortPoints with `P = cv::NoArray()` for both cameras to transform image +points to normalized image coordinates, which are valid for the identity camera intrinsic matrix. +When passing these coordinates, pass the identity matrix for this parameter. @param method Method for computing an essential matrix. - @ref RANSAC for the RANSAC algorithm. - @ref LMEDS for the LMedS algorithm. @@ -2477,7 +2520,7 @@ This function estimates essential matrix based on the five-point algorithm solve where \f$E\f$ is an essential matrix, \f$p_1\f$ and \f$p_2\f$ are corresponding points in the first and the second images, respectively. The result of this function may be passed further to -#decomposeEssentialMat or #recoverPose to recover the relative pose between cameras. +#decomposeEssentialMat or #recoverPose to recover the relative pose between cameras. */ CV_EXPORTS_W Mat findEssentialMat( @@ -2548,23 +2591,13 @@ Mat findEssentialMat( @param points1 Array of N (N \>= 5) 2D points from the first image. The point coordinates should be floating-point (single or double precision). -@param points2 Array of the second image points of the same size and format as points1 . -@param cameraMatrix1 Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . -Note that this function assumes that points1 and points2 are feature points from cameras with the -same camera matrix. If this assumption does not hold for your use case, use -#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points -to normalized image coordinates, which are valid for the identity camera matrix. When -passing these coordinates, pass the identity matrix for this parameter. 
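To illustrate the normalization advice in the findEssentialMat note above (cameras with differing intrinsics), a hedged sketch: undistort to normalized image coordinates with `P = noArray()` and pass an identity camera matrix. The wrapper name and the RANSAC threshold are illustrative.

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

cv::Mat essentialFromTwoCameras(const std::vector<cv::Point2f>& pts1,
                                const std::vector<cv::Point2f>& pts2,
                                const cv::Mat& K1, const cv::Mat& dist1,
                                const cv::Mat& K2, const cv::Mat& dist2)
{
    std::vector<cv::Point2f> norm1, norm2;
    cv::undistortPoints(pts1, norm1, K1, dist1, cv::noArray(), cv::noArray()); // P = noArray()
    cv::undistortPoints(pts2, norm2, K2, dist2, cv::noArray(), cv::noArray());
    cv::Mat identityK = cv::Mat::eye(3, 3, CV_64F);
    // Threshold is in normalized coordinates here, hence much smaller than in pixels.
    return cv::findEssentialMat(norm1, norm2, identityK, cv::RANSAC, 0.999, 1e-3);
}
```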
-@param cameraMatrix2 Camera matrix \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . -Note that this function assumes that points1 and points2 are feature points from cameras with the -same camera matrix. If this assumption does not hold for your use case, use -#undistortPoints with `P = cv::NoArray()` for both cameras to transform image points -to normalized image coordinates, which are valid for the identity camera matrix. When -passing these coordinates, pass the identity matrix for this parameter. -@param distCoeffs1 Input vector of distortion coefficients +@param points2 Array of the second image points of the same size and format as points1. +@param cameraMatrix1 Camera matrix for the first camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . +@param cameraMatrix2 Camera matrix for the second camera \f$K = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ . +@param distCoeffs1 Input vector of distortion coefficients for the first camera \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed. -@param distCoeffs2 Input vector of distortion coefficients +@param distCoeffs2 Input vector of distortion coefficients for the second camera \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6[, s_1, s_2, s_3, s_4[, \tau_x, \tau_y]]]])\f$ of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed. @param method Method for computing an essential matrix. @@ -2692,7 +2725,7 @@ CV_EXPORTS_W int recoverPose( InputArray points1, InputArray points2, InputOutputArray mask = noArray()); /** @brief Recovers the relative camera rotation and the translation from an estimated essential -matrix and the corresponding points in two images, using cheirality check. Returns the number of +matrix and the corresponding points in two images, using chirality check. Returns the number of inliers that pass the check. @param E The input essential matrix. @@ -2710,11 +2743,11 @@ described below. therefore is only known up to scale, i.e. t is the direction of the translation vector and has unit length. @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks -inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to -recover pose. In the output mask only inliers which pass the cheirality check. +inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to +recover pose. In the output mask only inliers which pass the chirality check. This function decomposes an essential matrix using @ref decomposeEssentialMat and then verifies -possible pose hypotheses by doing cheirality check. The cheirality check means that the +possible pose hypotheses by doing chirality check. The chirality check means that the triangulated 3D points should have positive depth. Some details can be found in @cite Nister03. This function can be used to process the output E and mask from @ref findEssentialMat. In this @@ -2761,8 +2794,8 @@ length. are feature points from cameras with same focal length and principal point. @param pp principal point of the camera. @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks -inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to -recover pose. 
In the output mask only inliers which pass the cheirality check. +inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to +recover pose. In the output mask only inliers which pass the chirality check. This function differs from the one above that it computes camera intrinsic matrix from focal length and principal point: @@ -2797,12 +2830,12 @@ length. @param distanceThresh threshold distance which is used to filter out far away points (i.e. infinite points). @param mask Input/output mask for inliers in points1 and points2. If it is not empty, then it marks -inliers in points1 and points2 for then given essential matrix E. Only these inliers will be used to -recover pose. In the output mask only inliers which pass the cheirality check. +inliers in points1 and points2 for the given essential matrix E. Only these inliers will be used to +recover pose. In the output mask only inliers which pass the chirality check. @param triangulatedPoints 3D points which were reconstructed by triangulation. This function differs from the one above that it outputs the triangulated 3D point that are used for -the cheirality check. +the chirality check. */ CV_EXPORTS_W int recoverPose( InputArray E, InputArray points1, InputArray points2, InputArray cameraMatrix, OutputArray R, OutputArray t, double distanceThresh, InputOutputArray mask = noArray(), @@ -2870,12 +2903,12 @@ CV_EXPORTS_W void triangulatePoints( InputArray projMatr1, InputArray projMatr2, @param newPoints1 The optimized points1. @param newPoints2 The optimized points2. -The function implements the Optimal Triangulation Method (see Multiple View Geometry for details). +The function implements the Optimal Triangulation Method (see Multiple View Geometry @cite HartleyZ00 for details). For each given point correspondence points1[i] \<-\> points2[i], and a fundamental matrix F, it computes the corrected correspondences newPoints1[i] \<-\> newPoints2[i] that minimize the geometric error \f$d(points1[i], newPoints1[i])^2 + d(points2[i],newPoints2[i])^2\f$ (where \f$d(a,b)\f$ is the geometric distance between points \f$a\f$ and \f$b\f$ ) subject to the epipolar constraint -\f$newPoints2^T * F * newPoints1 = 0\f$ . +\f$newPoints2^T \cdot F \cdot newPoints1 = 0\f$ . */ CV_EXPORTS_W void correctMatches( InputArray F, InputArray points1, InputArray points2, OutputArray newPoints1, OutputArray newPoints2 ); @@ -2939,7 +2972,7 @@ W x \\ y \\ \texttt{disparity} (x,y) \\ -z +1 \end{bmatrix}.\f] @sa @@ -3230,7 +3263,7 @@ Check @ref tutorial_homography "the corresponding tutorial" for more details. This function extracts relative camera motion between two views of a planar object and returns up to four mathematical solution tuples of rotation, translation, and plane normal. The decomposition of -the homography matrix H is described in detail in @cite Malis. +the homography matrix H is described in detail in @cite Malis2007. If the homography H, induced by the plane, gives the constraint \f[s_i \vecthree{x'_i}{y'_i}{1} \sim H \vecthree{x_i}{y_i}{1}\f] on the source image points @@ -3258,7 +3291,7 @@ CV_EXPORTS_W int decomposeHomographyMat(InputArray H, @param pointsMask optional Mat/Vector of 8u type representing the mask for the inliers as given by the #findHomography function This function is intended to filter the output of the #decomposeHomographyMat based on additional -information as described in @cite Malis . 
The summary of the method: the #decomposeHomographyMat function +information as described in @cite Malis2007 . The summary of the method: the #decomposeHomographyMat function returns 2 unique solutions and their "opposites" for a total of 4 solutions. If we have access to the sets of points visible in the camera frame before and after the homography transformation is applied, we can determine which are the true potential solutions and which are the opposites by verifying which @@ -3548,7 +3581,7 @@ where cameraMatrix can be chosen arbitrarily. of 4, 5, 8, 12 or 14 elements. If the vector is NULL/empty, the zero distortion coefficients are assumed. @param R Optional rectification transformation in the object space (3x3 matrix). R1 or R2 , computed by #stereoRectify can be passed here. If the matrix is empty, the identity transformation -is assumed. In cvInitUndistortMap R assumed to be an identity matrix. +is assumed. In #initUndistortRectifyMap R assumed to be an identity matrix. @param newCameraMatrix New camera matrix \f$A'=\vecthreethree{f_x'}{0}{c_x'}{0}{f_y'}{c_y'}{0}{0}{1}\f$. @param size Undistorted image size. @param m1type Type of the first output map that can be CV_32FC1, CV_32FC2 or CV_16SC2, see #convertMaps @@ -3772,7 +3805,7 @@ namespace fisheye @param imagePoints Output array of image points, 2xN/Nx2 1-channel or 1xN/Nx1 2-channel, or vector\. @param affine - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param alpha The skew coefficient. @param jacobian Optional output 2Nx15 jacobian matrix of derivatives of image points with respect @@ -3796,21 +3829,36 @@ namespace fisheye @param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\ ), where N is the number of points in the view. - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param alpha The skew coefficient. @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\ . Note that the function assumes the camera intrinsic matrix of the undistorted points to be identity. - This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$. + This means if you want to distort image points you have to multiply them with \f$K^{-1}\f$ or + use another function overload. */ CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray K, InputArray D, double alpha = 0); + /** @overload + Overload of distortPoints function to handle cases when undistorted points are obtained with non-identity + camera matrix, e.g. output of #estimateNewCameraMatrixForUndistortRectify. + @param undistorted Array of object points, 1xN/Nx1 2-channel (or vector\ ), where N is + the number of points in the view. + @param Kundistorted Camera intrinsic matrix used as new camera matrix for undistortion. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. + @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. + @param alpha The skew coefficient. + @param distorted Output array of image points, 1xN/Nx1 2-channel, or vector\ . 
+ @sa estimateNewCameraMatrixForUndistortRectify + */ + CV_EXPORTS_W void distortPoints(InputArray undistorted, OutputArray distorted, InputArray Kundistorted, InputArray K, InputArray D, double alpha = 0); + /** @brief Undistorts 2D points using fisheye model @param distorted Array of object points, 1xN/Nx1 2-channel (or vector\ ), where N is the number of points in the view. - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3 1-channel or 1x1 3-channel @@ -3825,7 +3873,7 @@ namespace fisheye /** @brief Computes undistortion and rectification maps for image transform by #remap. If D is empty zero distortion is used, if R or P is empty identity matrixes are used. - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3 1-channel or 1x1 3-channel @@ -3843,7 +3891,7 @@ namespace fisheye @param distorted image with fisheye lens distortion. @param undistorted Output image with compensated fisheye lens distortion. - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param Knew Camera intrinsic matrix of the distorted image. By default, it is the identity matrix but you may additionally scale and shift the result by using a different matrix. @@ -3872,7 +3920,7 @@ namespace fisheye /** @brief Estimates new camera intrinsic matrix for undistortion or rectification. - @param K Camera intrinsic matrix \f$cameramatrix{K}\f$. + @param K Camera intrinsic matrix \f$\cameramatrix{K}\f$. @param image_size Size of the image @param D Input vector of distortion coefficients \f$\distcoeffsfisheye\f$. @param R Rectification transformation in the object space: 3x3 1-channel, or vector: 3x1/1x3 @@ -3899,7 +3947,7 @@ namespace fisheye @ref fisheye::CALIB_USE_INTRINSIC_GUESS is specified, some or all of fx, fy, cx, cy must be initialized before calling the function. @param D Output vector of distortion coefficients \f$\distcoeffsfisheye\f$. - @param rvecs Output vector of rotation vectors (see Rodrigues ) estimated for each pattern view. + @param rvecs Output vector of rotation vectors (see @ref Rodrigues ) estimated for each pattern view. That is, each k-th rotation vector together with the corresponding k-th translation vector (see the next output parameter description) brings the calibration pattern from the model coordinate space (in which object points are specified) to the world coordinate space, that is, a real @@ -3941,7 +3989,7 @@ optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \ camera. @param P2 Output 3x4 projection matrix in the new (rectified) coordinate systems for the second camera. - @param Q Output \f$4 \times 4\f$ disparity-to-depth mapping matrix (see reprojectImageTo3D ). + @param Q Output \f$4 \times 4\f$ disparity-to-depth mapping matrix (see #reprojectImageTo3D ). @param flags Operation flags that may be zero or @ref fisheye::CALIB_ZERO_DISPARITY . 
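A sketch of the new fisheye::distortPoints overload added earlier in this hunk. It assumes the input points were undistorted with the same new camera matrix (Kundistorted) that is estimated here from K, D and the image size:

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

// Map points that were undistorted with a non-identity new camera matrix back
// into the original (distorted) fisheye image.
void redistortFisheyePoints(const std::vector<cv::Point2f>& undistortedPts,
                            std::vector<cv::Point2f>& distortedPts,
                            const cv::Mat& K, const cv::Mat& D, cv::Size imageSize)
{
    cv::Mat Kundistorted;
    cv::fisheye::estimateNewCameraMatrixForUndistortRectify(
        K, D, imageSize, cv::Mat::eye(3, 3, CV_64F), Kundistorted);
    cv::fisheye::distortPoints(undistortedPts, distortedPts, Kundistorted, K, D);
}
```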
If the flag is set, the function makes the principal points of each camera have the same pixel coordinates in the rectified views. And if the flag is not set, the function may still shift the images in the @@ -3977,6 +4025,15 @@ optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \ @param imageSize Size of the image used only to initialize camera intrinsic matrix. @param R Output rotation matrix between the 1st and the 2nd camera coordinate systems. @param T Output translation vector between the coordinate systems of the cameras. + @param rvecs Output vector of rotation vectors ( @ref Rodrigues ) estimated for each pattern view in the + coordinate system of the first camera of the stereo pair (e.g. std::vector). More in detail, each + i-th rotation vector together with the corresponding i-th translation vector (see the next output parameter + description) brings the calibration pattern from the object coordinate space (in which object points are + specified) to the camera coordinate space of the first camera of the stereo pair. In more technical terms, + the tuple of the i-th rotation and translation vector performs a change of basis from object coordinate space + to camera coordinate space of the first camera of the stereo pair. + @param tvecs Output vector of translation vectors estimated for each pattern view, see parameter description + of previous output parameter ( rvecs ). @param flags Different flags that may be zero or a combination of the following values: - @ref fisheye::CALIB_FIX_INTRINSIC Fix K1, K2? and D1, D2? so that only R, T matrices are estimated. @@ -3991,11 +4048,56 @@ optimization. It is the \f$max(width,height)/\pi\f$ or the provided \f$f_x\f$, \ zero. @param criteria Termination criteria for the iterative optimization algorithm. */ + CV_EXPORTS_W double stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, + InputOutputArray K1, InputOutputArray D1, InputOutputArray K2, InputOutputArray D2, Size imageSize, + OutputArray R, OutputArray T, OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs, int flags = fisheye::CALIB_FIX_INTRINSIC, + TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON)); + + /// @overload CV_EXPORTS_W double stereoCalibrate(InputArrayOfArrays objectPoints, InputArrayOfArrays imagePoints1, InputArrayOfArrays imagePoints2, InputOutputArray K1, InputOutputArray D1, InputOutputArray K2, InputOutputArray D2, Size imageSize, OutputArray R, OutputArray T, int flags = fisheye::CALIB_FIX_INTRINSIC, TermCriteria criteria = TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 100, DBL_EPSILON)); + /** + @brief Finds an object pose from 3D-2D point correspondences for fisheye camera moodel. + + @param objectPoints Array of object points in the object coordinate space, Nx3 1-channel or + 1xN/Nx1 3-channel, where N is the number of points. vector\ can be also passed here. + @param imagePoints Array of corresponding image points, Nx2 1-channel or 1xN/Nx1 2-channel, + where N is the number of points. vector\ can be also passed here. + @param cameraMatrix Input camera intrinsic matrix \f$\cameramatrix{A}\f$ . + @param distCoeffs Input vector of distortion coefficients (4x1/1x4). + @param rvec Output rotation vector (see @ref Rodrigues ) that, together with tvec, brings points from + the model coordinate system to the camera coordinate system. + @param tvec Output translation vector. 
+ @param useExtrinsicGuess Parameter used for #SOLVEPNP_ITERATIVE. If true (1), the function uses + the provided rvec and tvec values as initial approximations of the rotation and translation + vectors, respectively, and further optimizes them. + @param flags Method for solving a PnP problem: see @ref calib3d_solvePnP_flags + This function returns the rotation and the translation vectors that transform a 3D point expressed in the object + coordinate frame to the camera coordinate frame, using different methods: + - P3P methods (@ref SOLVEPNP_P3P, @ref SOLVEPNP_AP3P): need 4 input points to return a unique solution. + - @ref SOLVEPNP_IPPE Input points must be >= 4 and object points must be coplanar. + - @ref SOLVEPNP_IPPE_SQUARE Special case suitable for marker pose estimation. + Number of input points must be 4. Object points must be defined in the following order: + - point 0: [-squareLength / 2, squareLength / 2, 0] + - point 1: [ squareLength / 2, squareLength / 2, 0] + - point 2: [ squareLength / 2, -squareLength / 2, 0] + - point 3: [-squareLength / 2, -squareLength / 2, 0] + - for all the other flags, number of input points must be >= 4 and object points can be in any configuration. + @param criteria Termination criteria for internal undistortPoints call. + The function interally undistorts points with @ref undistortPoints and call @ref cv::solvePnP, + thus the input are very similar. More information about Perspective-n-Points is described in @ref calib3d_solvePnP + for more information. + */ + CV_EXPORTS_W bool solvePnP( InputArray objectPoints, InputArray imagePoints, + InputArray cameraMatrix, InputArray distCoeffs, + OutputArray rvec, OutputArray tvec, + bool useExtrinsicGuess = false, int flags = SOLVEPNP_ITERATIVE, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 10, 1e-8) + ); + //! @} calib3d_fisheye } // end namespace fisheye diff --git a/3rdParty/opencv2/core.hpp b/3rdParty/opencv2/core.hpp index 69d03236b3..cafa7bda27 100644 --- a/3rdParty/opencv2/core.hpp +++ b/3rdParty/opencv2/core.hpp @@ -60,15 +60,16 @@ /** @defgroup core Core functionality + +The Core module is the backbone of OpenCV, offering fundamental data structures, matrix operations, +and utility functions that other modules depend on. It’s essential for handling image data, +performing mathematical computations, and managing memory efficiently within the OpenCV ecosystem. + @{ @defgroup core_basic Basic structures - @defgroup core_c C structures and operations - @{ - @defgroup core_c_glue Connections with C++ - @} @defgroup core_array Operations on arrays @defgroup core_async Asynchronous API - @defgroup core_xml XML/YAML Persistence + @defgroup core_xml XML/YAML/JSON Persistence @defgroup core_cluster Clustering @defgroup core_utils Utility and system functions and macros @{ @@ -80,7 +81,6 @@ @defgroup core_utils_samples Utility functions for OpenCV samples @} @defgroup core_opengl OpenGL interoperability - @defgroup core_ipp Intel IPP Asynchronous C/C++ Converters @defgroup core_optim Optimization Algorithms @defgroup core_directx DirectX interoperability @defgroup core_eigen Eigen support @@ -100,6 +100,7 @@ @{ @defgroup core_parallel_backend Parallel backends API @} + @defgroup core_quaternion Quaternion @} */ @@ -127,12 +128,12 @@ class CV_EXPORTS Exception : public std::exception Instead, the macros CV_Error(), CV_Error_() and CV_Assert() are used. 
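Returning to the fisheye::solvePnP wrapper added above in calib3d.hpp, a minimal usage sketch; the wrapper name is hypothetical and the correspondences are assumed to come from a detected planar target:

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

// K is the fisheye camera matrix and D the 4-element fisheye distortion vector.
bool estimateFisheyePose(const std::vector<cv::Point3f>& objectPoints,
                         const std::vector<cv::Point2f>& imagePoints,
                         const cv::Mat& K, const cv::Mat& D,
                         cv::Mat& rvec, cv::Mat& tvec)
{
    // Internally the wrapper undistorts the image points and then calls cv::solvePnP.
    return cv::fisheye::solvePnP(objectPoints, imagePoints, K, D, rvec, tvec,
                                 false, cv::SOLVEPNP_ITERATIVE);
}
```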
*/ Exception(int _code, const String& _err, const String& _func, const String& _file, int _line); - virtual ~Exception() throw(); + virtual ~Exception() CV_NOEXCEPT; /*! \return the error description and the context as a text string. */ - virtual const char *what() const throw() CV_OVERRIDE; + virtual const char *what() const CV_NOEXCEPT CV_OVERRIDE; void formatMessage(); String msg; ///< the formatted error message @@ -167,7 +168,7 @@ enum SortFlags { SORT_EVERY_ROW = 0, //!< each matrix row is sorted independe //! @} core_utils -//! @addtogroup core +//! @addtogroup core_array //! @{ //! Covariation flags @@ -206,52 +207,25 @@ enum CovarFlags { COVAR_COLS = 16 }; -//! @addtogroup core_cluster -//! @{ - -//! k-Means flags -enum KmeansFlags { - /** Select random initial centers in each attempt.*/ - KMEANS_RANDOM_CENTERS = 0, - /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007].*/ - KMEANS_PP_CENTERS = 2, - /** During the first (and possibly the only) attempt, use the - user-supplied labels instead of computing them from the initial centers. For the second and - further attempts, use the random or semi-random centers. Use one of KMEANS_\*_CENTERS flag - to specify the exact method.*/ - KMEANS_USE_INITIAL_LABELS = 1 -}; - -//! @} core_cluster - -//! @addtogroup core_array -//! @{ - enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix. REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix. REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix. - REDUCE_MIN = 3 //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix. + REDUCE_MIN = 3, //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix. + REDUCE_SUM2 = 4 //!< the output is the sum of all squared rows/columns of the matrix. }; -//! @} core_array - /** @brief Swaps two matrices */ CV_EXPORTS void swap(Mat& a, Mat& b); /** @overload */ CV_EXPORTS void swap( UMat& a, UMat& b ); -//! @} core - -//! @addtogroup core_array -//! @{ - /** @brief Computes the source location of an extrapolated pixel. The function computes and returns the coordinate of a donor pixel corresponding to the specified extrapolated pixel when using the specified extrapolation border mode. For example, if you use cv::BORDER_WRAP mode in the horizontal direction, cv::BORDER_REFLECT_101 in the vertical direction and -want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img , it +want to compute value of the "virtual" pixel Point(-5, 100) in a floating-point image img, it looks like: @code{.cpp} float val = img.at(borderInterpolate(100, img.rows, cv::BORDER_REFLECT_101), @@ -262,7 +236,7 @@ copyMakeBorder. @param p 0-based coordinate of the extrapolated pixel along one of the axes, likely \<0 or \>= len @param len Length of the array along the corresponding axis. @param borderType Border type, one of the #BorderTypes, except for #BORDER_TRANSPARENT and -#BORDER_ISOLATED . When borderType==#BORDER_CONSTANT , the function always returns -1, regardless +#BORDER_ISOLATED. When borderType==#BORDER_CONSTANT, the function always returns -1, regardless of p and len. @sa copyMakeBorder @@ -348,6 +322,9 @@ be set to the default -1. In this case, the output array will have the same dept array, be it src1, src2 or both. @note Saturation is not applied when the output array has the depth CV_32S. 
You may even get result of an incorrect sign in the case of overflow. +@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array. +`add(src,X)` means `add(src,(X,X,X,X))`. +`add(src,(X,))` means `add(src,(X,0,0,0))`. @param src1 first input array or a scalar. @param src2 second input array or a scalar. @param dst output array that has the same size and number of channels as the input array(s); the @@ -389,6 +366,9 @@ in the first case, when src1.depth() == src2.depth(), dtype can be set to the de case the output array will have the same depth as the input array, be it src1, src2 or both. @note Saturation is not applied when the output array has the depth CV_32S. You may even get result of an incorrect sign in the case of overflow. +@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array. +`subtract(src,X)` means `subtract(src,(X,X,X,X))`. +`subtract(src,(X,))` means `subtract(src,(X,0,0,0))`. @param src1 first input array or a scalar. @param src2 second input array or a scalar. @param dst output array of the same size and the same number of channels as the input array. @@ -414,6 +394,9 @@ For a not-per-element matrix product, see gemm . @note Saturation is not applied when the output array has the depth CV_32S. You may even get result of an incorrect sign in the case of overflow. +@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array. +`multiply(src,X)` means `multiply(src,(X,X,X,X))`. +`multiply(src,(X,))` means `multiply(src,(X,0,0,0))`. @param src1 first input array. @param src2 second input array of the same size and the same type as src1. @param dst output array of the same size and type as src1. @@ -442,6 +425,9 @@ Expect correct IEEE-754 behaviour for floating-point data (with NaN, Inf result @note Saturation is not applied when the output array has the depth CV_32S. You may even get result of an incorrect sign in the case of overflow. +@note (Python) Be careful to difference behaviour between src1/src2 are single number and they are tuple/array. +`divide(src,X)` means `divide(src,(X,X,X,X))`. +`divide(src,(X,))` means `divide(src,(X,0,0,0))`. @param src1 first input array. @param src2 second input array of the same size and type as src1. @param scale scalar factor. @@ -477,10 +463,6 @@ The function can also be emulated with a matrix expression, for example: */ CV_EXPORTS_W void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst); -/** @example samples/cpp/tutorial_code/HighGUI/AddingImagesTrackbar.cpp -Check @ref tutorial_trackbar "the corresponding tutorial" for more details -*/ - /** @brief Calculates the weighted sum of two arrays. The function addWeighted calculates the weighted sum of two arrays as follows: @@ -543,9 +525,15 @@ The format of half precision floating point is defined in IEEE 754-2008. @param src input array. @param dst output array. + +@deprecated Use Mat::convertTo with CV_16F instead. */ CV_EXPORTS_W void convertFp16(InputArray src, OutputArray dst); +/** @example samples/cpp/tutorial_code/core/how_to_scan_images/how_to_scan_images.cpp +Check @ref tutorial_how_to_scan_images "the corresponding tutorial" for more details +*/ + /** @brief Performs a look-up table transform of an array. The function LUT fills the output array with values from the look-up table. Indices of the entries @@ -571,12 +559,40 @@ independently for each channel. 
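Following the new deprecation note on convertFp16 above, a short sketch of the suggested replacement via Mat::convertTo with CV_16F; the round trip shown is only illustrative:

```cpp
#include <opencv2/core.hpp>

int main()
{
    // Per the deprecation note above, Mat::convertTo with CV_16F replaces convertFp16.
    cv::Mat src = cv::Mat::ones(4, 4, CV_32F);
    cv::Mat half, restored;
    src.convertTo(half, CV_16F);       // float32 -> float16
    half.convertTo(restored, CV_32F);  // float16 -> float32
    return 0;
}
```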
*/ CV_EXPORTS_AS(sumElems) Scalar sum(InputArray src); +/** @brief Checks for the presence of at least one non-zero array element. + +The function returns whether there are non-zero elements in src + +The function do not work with multi-channel arrays. If you need to check non-zero array +elements across all the channels, use Mat::reshape first to reinterpret the array as +single-channel. Or you may extract the particular channel using either extractImageCOI, or +mixChannels, or split. + +@note +- If the location of non-zero array elements is important, @ref findNonZero is helpful. +- If the count of non-zero array elements is important, @ref countNonZero is helpful. +@param src single-channel array. +@sa mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix +@sa findNonZero, countNonZero +*/ +CV_EXPORTS_W bool hasNonZero( InputArray src ); + /** @brief Counts non-zero array elements. The function returns the number of non-zero elements in src : \f[\sum _{I: \; \texttt{src} (I) \ne0 } 1\f] + +The function do not work with multi-channel arrays. If you need to count non-zero array +elements across all the channels, use Mat::reshape first to reinterpret the array as +single-channel. Or you may extract the particular channel using either extractImageCOI, or +mixChannels, or split. + +@note +- If only whether there are non-zero elements is important, @ref hasNonZero is helpful. +- If the location of non-zero array elements is important, @ref findNonZero is helpful. @param src single-channel array. @sa mean, meanStdDev, norm, minMaxLoc, calcCovarMatrix +@sa findNonZero, hasNonZero */ CV_EXPORTS_W int countNonZero( InputArray src ); @@ -603,8 +619,18 @@ or // access pixel coordinates Point pnt = locations[i]; @endcode + +The function do not work with multi-channel arrays. If you need to find non-zero +elements across all the channels, use Mat::reshape first to reinterpret the array as +single-channel. Or you may extract the particular channel using either extractImageCOI, or +mixChannels, or split. + +@note +- If only count of non-zero array elements is important, @ref countNonZero is helpful. +- If only whether there are non-zero elements is important, @ref hasNonZero is helpful. @param src single-channel array @param idx the output array, type of cv::Mat or std::vector, corresponding to non-zero indices in the input +@sa countNonZero, hasNonZero */ CV_EXPORTS_W void findNonZero( InputArray src, OutputArray idx ); @@ -806,13 +832,21 @@ CV_EXPORTS void normalize( const SparseMat& src, SparseMat& dst, double alpha, i /** @brief Finds the global minimum and maximum in an array. The function cv::minMaxLoc finds the minimum and maximum element values and their positions. The -extremums are searched across the whole array or, if mask is not an empty array, in the specified +extrema are searched across the whole array or, if mask is not an empty array, in the specified array region. -The function do not work with multi-channel arrays. If you need to find minimum or maximum -elements across all the channels, use Mat::reshape first to reinterpret the array as -single-channel. Or you may extract the particular channel using either extractImageCOI , or -mixChannels , or split . +In C++, if the input is multi-channel, you should omit the minLoc, maxLoc, and mask arguments +(i.e. leave them as NULL, NULL, and noArray() respectively). These arguments are not +supported for multi-channel input arrays. 
If working with multi-channel input and you +need the minLoc, maxLoc, or mask arguments, then use Mat::reshape first to reinterpret +the array as single-channel. Alternatively, you can extract the particular channel using either +extractImageCOI, mixChannels, or split. + +In Python, multi-channel input is not supported at all due to a limitation in the +binding generation process (there is no way to set minLoc and maxLoc to NULL). A +workaround is to operate on each channel individually or to use NumPy to achieve the same +functionality. + @param src input single-channel array. @param minVal pointer to the returned minimum value; NULL is used if not required. @param maxVal pointer to the returned maximum value; NULL is used if not required. @@ -863,11 +897,8 @@ CV_EXPORTS_W void reduceArgMax(InputArray src, OutputArray dst, int axis, bool l The function cv::minMaxIdx finds the minimum and maximum element values and their positions. The extremums are searched across the whole array or, if mask is not an empty array, in the specified -array region. The function does not work with multi-channel arrays. If you need to find minimum or -maximum elements across all the channels, use Mat::reshape first to reinterpret the array as -single-channel. Or you may extract the particular channel using either extractImageCOI , or -mixChannels , or split . In case of a sparse matrix, the minimum is found among non-zero elements -only. +array region. In case of a sparse matrix, the minimum is found among non-zero elements +only. Multi-channel input is supported without mask and extremums indexes (should be nullptr). @note When minIdx is not NULL, it must have at least 2 elements (as well as maxIdx), even if src is a single-row or single-column matrix. In OpenCV (following MATLAB) each array has at least 2 dimensions, i.e. single-column matrix is Mx1 matrix (and therefore minIdx/maxIdx will be @@ -902,8 +933,8 @@ CV_EXPORTS void minMaxLoc(const SparseMat& a, double* minVal, The function #reduce reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a -raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one. -In case of #REDUCE_SUM and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy. +raster image. In case of #REDUCE_MAX and #REDUCE_MIN, the output image should have the same type as the source one. +In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG, the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes. The following code demonstrates its usage for a single channel matrix. @@ -957,7 +988,7 @@ CV_EXPORTS_W void merge(InputArrayOfArrays mv, OutputArray dst); The function cv::split splits a multi-channel array into separate single-channel arrays: \f[\texttt{mv} [c](I) = \texttt{src} (I)_c\f] If you need to extract a single channel or do some other sophisticated channel permutation, use -mixChannels . +mixChannels. The following example demonstrates how to split a 3-channel matrix into 3 single channel matrices. 
@snippet snippets/core_split.cpp example @@ -1098,10 +1129,24 @@ The example scenarios of using the function are the following: flipping around the x-axis and positive value (for example, 1) means flipping around y-axis. Negative value (for example, -1) means flipping around both axes. -@sa transpose , repeat , completeSymm +@sa transpose, repeat, completeSymm */ CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode); +/** @brief Flips a n-dimensional at given axis + * @param src input array + * @param dst output array that has the same shape of src + * @param axis axis that performs a flip on. 0 <= axis < src.dims. + */ +CV_EXPORTS_W void flipND(InputArray src, OutputArray dst, int axis); + +/** @brief Broadcast the given Mat to the given shape. + * @param src input array + * @param shape target shape. Should be a list of CV_32S numbers. Note that negative values are not supported. + * @param dst output array that has the given shape + */ +CV_EXPORTS_W void broadcast(InputArray src, InputArray shape, OutputArray dst); + enum RotateFlags { ROTATE_90_CLOCKWISE = 0, //!& order, OutputArray dst); @@ -1988,7 +2036,7 @@ in the descending order. @param eigenvectors output matrix of eigenvectors; it has the same size and type as src; the eigenvectors are stored as subsequent matrix rows, in the same order as the corresponding eigenvalues. -@sa eigenNonSymmetric, completeSymm , PCA +@sa eigenNonSymmetric, completeSymm, PCA */ CV_EXPORTS_W bool eigen(InputArray src, OutputArray eigenvalues, OutputArray eigenvectors = noArray()); @@ -2128,7 +2176,7 @@ So, the function chooses an operation mode depending on the flags and size of th If #DFT_SCALE is set, the scaling is done after the transformation. -Unlike dct , the function supports arrays of arbitrary size. But only those arrays are processed +Unlike dct, the function supports arrays of arbitrary size. But only those arrays are processed efficiently, whose sizes can be factorized in a product of small prime numbers (2, 3, and 5 in the current implementation). Such an efficient DFT size can be calculated using the getOptimalDFTSize method. @@ -2211,8 +2259,8 @@ nonzeroRows rows of the input array (#DFT_INVERSE is not set) or only the first output array (#DFT_INVERSE is set) contain non-zeros, thus, the function can handle the rest of the rows more efficiently and save some time; this technique is very useful for calculating array cross-correlation or convolution using DFT. -@sa dct , getOptimalDFTSize , mulSpectrums, filter2D , matchTemplate , flip , cartToPolar , -magnitude , phase +@sa dct, getOptimalDFTSize, mulSpectrums, filter2D, matchTemplate, flip, cartToPolar, +magnitude, phase */ CV_EXPORTS_W void dft(InputArray src, OutputArray dst, int flags = 0, int nonzeroRows = 0); @@ -2249,9 +2297,9 @@ floating-point array: \f[X = \left (C^{(N)} \right )^T \cdot X \cdot C^{(N)}\f] The function chooses the mode of operation by looking at the flags and size of the input array: -- If (flags & #DCT_INVERSE) == 0 , the function does a forward 1D or 2D transform. Otherwise, it +- If (flags & #DCT_INVERSE) == 0, the function does a forward 1D or 2D transform. Otherwise, it is an inverse 1D or 2D transform. -- If (flags & #DCT_ROWS) != 0 , the function performs a 1D transform of each row. +- If (flags & #DCT_ROWS) != 0, the function performs a 1D transform of each row. - If the array is a single column or a single row, the function performs a 1D transform. - If none of the above is true, the function performs a 2D transform. 
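A hedged usage sketch for the new flipND and broadcast declarations earlier in this hunk; the shapes and values are illustrative and assume NumPy-style broadcasting semantics, as the documentation above suggests:

```cpp
#include <opencv2/core.hpp>
#include <vector>

int main()
{
    // flipND: flip an n-dimensional array along one axis (0 <= axis < src.dims).
    int dims[3] = {2, 3, 4};
    cv::Mat vol(3, dims, CV_32F, cv::Scalar(0));
    cv::Mat flipped;
    cv::flipND(vol, flipped, 1);

    // broadcast: expand a 1x3 row to the target 4x3 shape (shape given as CV_32S values).
    cv::Mat row = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);
    std::vector<int> shape = {4, 3};
    cv::Mat tiled;
    cv::broadcast(row, shape, tiled);   // each row of 'tiled' equals 'row'
    return 0;
}
```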
@@ -2267,7 +2315,7 @@ of a vector of size N/2 . Thus, the optimal DCT size N1 \>= N can be calculated @param src input floating-point array. @param dst output array of the same size and type as src . @param flags transformation flags as a combination of cv::DftFlags (DCT_*) -@sa dft , getOptimalDFTSize , idct +@sa dft, getOptimalDFTSize, idct */ CV_EXPORTS_W void dct(InputArray src, OutputArray dst, int flags = 0); @@ -2286,7 +2334,7 @@ CV_EXPORTS_W void idct(InputArray src, OutputArray dst, int flags = 0); The function cv::mulSpectrums performs the per-element multiplication of the two CCS-packed or complex matrices that are results of a real or complex Fourier transform. -The function, together with dft and idft , may be used to calculate convolution (pass conjB=false ) +The function, together with dft and idft, may be used to calculate convolution (pass conjB=false ) or correlation (pass conjB=true ) of two arrays rapidly. When the arrays are complex, they are simply multiplied (per element) with an optional conjugation of the second-array elements. When the arrays are real, they are assumed to be CCS-packed (see dft for details). @@ -2320,7 +2368,7 @@ While the function cannot be used directly to estimate the optimal vector size f (since the current DCT implementation supports only even-size vectors), it can be easily processed as getOptimalDFTSize((vecsize+1)/2)\*2. @param vecsize vector size. -@sa dft , dct , idft , idct , mulSpectrums +@sa dft, dct, idft, idct, mulSpectrums */ CV_EXPORTS_W int getOptimalDFTSize(int vecsize); @@ -2872,7 +2920,7 @@ class CV_EXPORTS RNG The methods transform the state using the MWC algorithm and return the next random number. The first form is equivalent to RNG::next . The - second form returns the random number modulo N , which means that the + second form returns the random number modulo N, which means that the result is in the range [0, N) . */ unsigned operator ()(); @@ -3014,8 +3062,21 @@ class CV_EXPORTS RNG_MT19937 //! @addtogroup core_cluster //! @{ +//! k-means flags +enum KmeansFlags { + /** Select random initial centers in each attempt.*/ + KMEANS_RANDOM_CENTERS = 0, + /** Use kmeans++ center initialization by Arthur and Vassilvitskii [Arthur2007].*/ + KMEANS_PP_CENTERS = 2, + /** During the first (and possibly the only) attempt, use the + user-supplied labels instead of computing them from the initial centers. For the second and + further attempts, use the random or semi-random centers. Use one of KMEANS_\*_CENTERS flag + to specify the exact method.*/ + KMEANS_USE_INITIAL_LABELS = 1 +}; + /** @example samples/cpp/kmeans.cpp -An example on K-means clustering +An example on k-means clustering */ /** @brief Finds centers of clusters and groups input samples around the clusters. @@ -3025,7 +3086,7 @@ and groups the input samples around the clusters. As an output, \f$\texttt{bestL 0-based cluster index for the sample stored in the \f$i^{th}\f$ row of the samples matrix. @note -- (Python) An example on K-means clustering can be found at +- (Python) An example on k-means clustering can be found at opencv_source_code/samples/python/kmeans.py @param data Data for clustering. An array of N-Dimensional points with float coordinates is needed. 
Examples of this array can be: @@ -3142,12 +3203,16 @@ class CV_EXPORTS_W Algorithm /** @brief Stores algorithm parameters in a file storage */ - virtual void write(FileStorage& fs) const { CV_UNUSED(fs); } + CV_WRAP virtual void write(FileStorage& fs) const { CV_UNUSED(fs); } - /** @brief simplified API for language bindings + /** * @overload */ - CV_WRAP void write(const Ptr& fs, const String& name = String()) const; + CV_WRAP void write(FileStorage& fs, const String& name) const; +#if CV_VERSION_MAJOR < 5 + /** @deprecated */ + void write(const Ptr& fs, const String& name = String()) const; +#endif /** @brief Reads algorithm parameters from a file storage */ diff --git a/3rdParty/opencv2/core/affine.hpp b/3rdParty/opencv2/core/affine.hpp index 5b01b35f2c..bb5774231c 100644 --- a/3rdParty/opencv2/core/affine.hpp +++ b/3rdParty/opencv2/core/affine.hpp @@ -51,7 +51,7 @@ namespace cv { -//! @addtogroup core +//! @addtogroup core_eigen //! @{ /** @brief Affine transform diff --git a/3rdParty/opencv2/core/async.hpp b/3rdParty/opencv2/core/async.hpp index 4c0b298f8a..4c56354d28 100644 --- a/3rdParty/opencv2/core/async.hpp +++ b/3rdParty/opencv2/core/async.hpp @@ -7,10 +7,8 @@ #include -#ifdef CV_CXX11 //#include #include -#endif namespace cv { @@ -69,7 +67,6 @@ class CV_EXPORTS_W AsyncArray CV_WRAP bool valid() const CV_NOEXCEPT; -#ifdef CV_CXX11 inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; } inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; } @@ -89,7 +86,6 @@ class CV_EXPORTS_W AsyncArray std::future getFutureMat() const; std::future getFutureUMat() const; #endif -#endif // PImpl diff --git a/3rdParty/opencv2/core/base.hpp b/3rdParty/opencv2/core/base.hpp index 64c6c05144..4e810931cb 100644 --- a/3rdParty/opencv2/core/base.hpp +++ b/3rdParty/opencv2/core/base.hpp @@ -271,11 +271,11 @@ enum BorderTypes { BORDER_REFLECT = 2, //!< `fedcba|abcdefgh|hgfedcb` BORDER_WRAP = 3, //!< `cdefgh|abcdefgh|abcdefg` BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba` - BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` + BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno` - Treats outliers as transparent. BORDER_REFLECT101 = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101 BORDER_DEFAULT = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101 - BORDER_ISOLATED = 16 //!< do not look outside of ROI + BORDER_ISOLATED = 16 //!< Interpolation restricted within the ROI boundaries. }; //! @} core_array @@ -288,14 +288,29 @@ enum BorderTypes { By default the function prints information about the error to stderr, then it either stops if setBreakOnError() had been called before or raises the exception. It is possible to alternate error processing by using redirectError(). -@param _code - error code (Error::Code) -@param _err - error description -@param _func - function name. Available only when the compiler supports getting it -@param _file - source file name where the error has occurred -@param _line - line number in the source file where the error has occurred +@param code - error code (Error::Code) +@param err - error description +@param func - function name. 
Available only when the compiler supports getting it +@param file - source file name where the error has occurred +@param line - line number in the source file where the error has occurred @see CV_Error, CV_Error_, CV_Assert, CV_DbgAssert */ -CV_EXPORTS CV_NORETURN void error(int _code, const String& _err, const char* _func, const char* _file, int _line); +CV_EXPORTS CV_NORETURN void error(int code, const String& err, const char* func, const char* file, int line); + +/*! @brief Signals an error and terminates the application. + +By default the function prints information about the error to stderr, then it terminates the application +with std::terminate. The function is designed for invariant checks in functions and methods with +the noexcept attribute. +@param code - error code (Error::Code) +@param err - error description +@param func - function name. Available only when the compiler supports getting it +@param file - source file name where the error has occurred +@param line - line number in the source file where the error has occurred +@see CV_AssertTerminate + */ +CV_EXPORTS CV_NORETURN void terminate(int code, const String& err, const char* func, const char* file, int line) CV_NOEXCEPT; + #ifdef CV_STATIC_ANALYSIS @@ -338,8 +353,11 @@ for example: The macros CV_Assert (and CV_DbgAssert(expr)) evaluate the specified expression. If it is 0, the macros raise an error (see cv::error). The macro CV_Assert checks the condition in both Debug and Release configurations while CV_DbgAssert is only retained in the Debug configuration. +CV_AssertTerminate is an analog of CV_Assert for invariant checks in functions with the noexcept attribute. +It does not throw an exception, but terminates the application. */ #define CV_Assert( expr ) do { if(!!(expr)) ; else cv::error( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0) +#define CV_AssertTerminate( expr ) do { if(!!(expr)) ; else cv::terminate( cv::Error::StsAssert, #expr, CV_Func, __FILE__, __LINE__ ); } while(0) #endif // CV_STATIC_ANALYSIS diff --git a/3rdParty/opencv2/core/bindings_utils.hpp b/3rdParty/opencv2/core/bindings_utils.hpp index 9f96a01eda..0e18693e24 100644 --- a/3rdParty/opencv2/core/bindings_utils.hpp +++ b/3rdParty/opencv2/core/bindings_utils.hpp @@ -35,6 +35,14 @@ String dumpInt(int argument) return cv::format("Int: %d", argument); } +CV_WRAP static inline +String dumpInt64(int64 argument) +{ + std::ostringstream oss("Int64: ", std::ios::ate); + oss << argument; + return oss.str(); +} + CV_WRAP static inline String dumpSizeT(size_t argument) { @@ -67,20 +75,6 @@ String dumpString(const String& argument) return cv::format("String: %s", argument.c_str()); } -CV_WRAP static inline -String testOverloadResolution(int value, const Point& point = Point(42, 24)) -{ - return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x, - point.y); -} - -CV_WRAP static inline -String testOverloadResolution(const Rect& rect) -{ - return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y, - rect.width, rect.height); -} - CV_WRAP static inline String dumpRect(const Rect& argument) { @@ -103,6 +97,42 @@ String dumpRotatedRect(const RotatedRect& argument) argument.size.height, argument.angle); } +CV_WRAP static inline +String dumpRange(const Range& argument) +{ + if (argument == Range::all()) + { + return "range: all"; + } + else + { + return format("range: (s=%d, e=%d)", argument.start, argument.end); + } +} + +CV_EXPORTS_W String dumpVectorOfInt(const std::vector& vec); + +CV_EXPORTS_W String dumpVectorOfDouble(const std::vector& vec); +
+CV_EXPORTS_W String dumpVectorOfRect(const std::vector& vec); + + +//! @cond IGNORED + +CV_WRAP static inline +String testOverloadResolution(int value, const Point& point = Point(42, 24)) +{ + return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x, + point.y); +} + +CV_WRAP static inline +String testOverloadResolution(const Rect& rect) +{ + return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y, + rect.width, rect.height); +} + CV_WRAP static inline RotatedRect testRotatedRect(float x, float y, float w, float h, float angle) { @@ -118,19 +148,6 @@ std::vector testRotatedRectVector(float x, float y, float w, float return result; } -CV_WRAP static inline -String dumpRange(const Range& argument) -{ - if (argument == Range::all()) - { - return "range: all"; - } - else - { - return format("range: (s=%d, e=%d)", argument.start, argument.end); - } -} - CV_WRAP static inline int testOverwriteNativeMethod(int argument) { @@ -143,12 +160,6 @@ String testReservedKeywordConversion(int positional_argument, int lambda = 2, in return format("arg=%d, lambda=%d, from=%d", positional_argument, lambda, from); } -CV_EXPORTS_W String dumpVectorOfInt(const std::vector& vec); - -CV_EXPORTS_W String dumpVectorOfDouble(const std::vector& vec); - -CV_EXPORTS_W String dumpVectorOfRect(const std::vector& vec); - CV_WRAP static inline void generateVectorOfRect(size_t len, CV_OUT std::vector& vec) { @@ -219,6 +230,49 @@ AsyncArray testAsyncException() return p.getArrayResult(); } +CV_WRAP static inline +String dumpVec2i(const cv::Vec2i value = cv::Vec2i(42, 24)) { + return format("Vec2i(%d, %d)", value[0], value[1]); +} + +struct CV_EXPORTS_W_SIMPLE ClassWithKeywordProperties { + CV_PROP_RW int lambda; + CV_PROP int except; + + CV_WRAP explicit ClassWithKeywordProperties(int lambda_arg = 24, int except_arg = 42) + { + lambda = lambda_arg; + except = except_arg; + } +}; + +struct CV_EXPORTS_W_PARAMS FunctionParams +{ + CV_PROP_RW int lambda = -1; + CV_PROP_RW float sigma = 0.0f; + + FunctionParams& setLambda(int value) CV_NOEXCEPT + { + lambda = value; + return *this; + } + + FunctionParams& setSigma(float value) CV_NOEXCEPT + { + sigma = value; + return *this; + } +}; + +CV_WRAP static inline String +copyMatAndDumpNamedArguments(InputArray src, OutputArray dst, + const FunctionParams& params = FunctionParams()) +{ + src.copyTo(dst); + return format("lambda=%d, sigma=%.1f", params.lambda, + params.sigma); +} + namespace nested { CV_WRAP static inline bool testEchoBooleanFunction(bool flag) { return flag; @@ -272,6 +326,8 @@ class CV_EXPORTS_W CV_WRAP_AS(ExportClassName) OriginalClassName typedef OriginalClassName::Params OriginalClassName_Params; } // namespace nested +//! @endcond IGNORED + namespace fs { CV_EXPORTS_W cv::String getCacheDirectoryForDownloads(); } // namespace fs diff --git a/3rdParty/opencv2/core/bufferpool.hpp b/3rdParty/opencv2/core/bufferpool.hpp index 90af9d0d81..5322c87953 100644 --- a/3rdParty/opencv2/core/bufferpool.hpp +++ b/3rdParty/opencv2/core/bufferpool.hpp @@ -15,7 +15,7 @@ namespace cv { -//! @addtogroup core +//! @addtogroup core_opencl //! 
@{ class BufferPoolController diff --git a/3rdParty/opencv2/core/check.hpp b/3rdParty/opencv2/core/check.hpp index 5dd206f656..dc1dd70820 100644 --- a/3rdParty/opencv2/core/check.hpp +++ b/3rdParty/opencv2/core/check.hpp @@ -65,6 +65,7 @@ struct CheckContext { static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \ { CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, "" message, "" p1_str, "" p2_str } +CV_EXPORTS void CV_NORETURN check_failed_auto(const bool v1, const bool v2, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx); @@ -74,6 +75,9 @@ CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, co CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_true(const bool v, const CheckContext& ctx); +CV_EXPORTS void CV_NORETURN check_failed_false(const bool v, const CheckContext& ctx); + CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx); CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx); @@ -131,9 +135,18 @@ CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckCon /// Example: depth == CV_32F || depth == CV_64F #define CV_CheckDepth(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatDepth, t, (test_expr), #t, #test_expr, msg) +/// Example: channel == 1 || channel == 3 +#define CV_CheckChannels(t, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, MatChannels, t, (test_expr), #t, #test_expr, msg) + /// Example: v == A || v == B #define CV_Check(v, test_expr, msg) CV__CHECK_CUSTOM_TEST(_, auto, v, (test_expr), #v, #test_expr, msg) +/// Example: v == true +#define CV_CheckTrue(v, msg) CV__CHECK_CUSTOM_TEST(_, true, v, v, #v, "", msg) + +/// Example: v == false +#define CV_CheckFalse(v, msg) CV__CHECK_CUSTOM_TEST(_, false, v, (!(v)), #v, "", msg) + /// Some complex conditions: CV_Check(src2, src2.empty() || (src2.type() == src1.type() && src2.size() == src1.size()), "src2 should have same size/type as src1") // TODO define pretty-printers diff --git a/3rdParty/opencv2/core/cuda.hpp b/3rdParty/opencv2/core/cuda.hpp index f1119db17b..53898171f2 100644 --- a/3rdParty/opencv2/core/cuda.hpp +++ b/3rdParty/opencv2/core/cuda.hpp @@ -118,6 +118,7 @@ class CV_EXPORTS_W GpuMat //! default allocator CV_WRAP static GpuMat::Allocator* defaultAllocator(); CV_WRAP static void setDefaultAllocator(GpuMat::Allocator* allocator); + CV_WRAP static GpuMat::Allocator* getStdAllocator(); //! default constructor CV_WRAP explicit GpuMat(GpuMat::Allocator* allocator = GpuMat::defaultAllocator()); @@ -198,16 +199,32 @@ class CV_EXPORTS_W GpuMat CV_WRAP GpuMat clone() const; //! copies the GpuMat content to device memory (Blocking call) - CV_WRAP void copyTo(OutputArray dst) const; + void copyTo(OutputArray dst) const; + //! bindings overload which copies the GpuMat content to device memory (Blocking call) + CV_WRAP void copyTo(CV_OUT GpuMat& dst) const { + copyTo(static_cast(dst)); + } //! 
copies the GpuMat content to device memory (Non-Blocking call) - CV_WRAP void copyTo(OutputArray dst, Stream& stream) const; + void copyTo(OutputArray dst, Stream& stream) const; + //! bindings overload which copies the GpuMat content to device memory (Non-Blocking call) + CV_WRAP void copyTo(CV_OUT GpuMat& dst, Stream& stream) const { + copyTo(static_cast(dst), stream); + } //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call) - CV_WRAP void copyTo(OutputArray dst, InputArray mask) const; + void copyTo(OutputArray dst, InputArray mask) const; + //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Blocking call) + CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask) const { + copyTo(static_cast(dst), static_cast(mask)); + } //! copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call) - CV_WRAP void copyTo(OutputArray dst, InputArray mask, Stream& stream) const; + void copyTo(OutputArray dst, InputArray mask, Stream& stream) const; + //! bindings overload which copies those GpuMat elements to "m" that are marked with non-zero mask elements (Non-Blocking call) + CV_WRAP void copyTo(CV_OUT GpuMat& dst, GpuMat& mask, Stream& stream) const { + copyTo(static_cast(dst), static_cast(mask), stream); + } //! sets some of the GpuMat elements to s (Blocking call) CV_WRAP GpuMat& setTo(Scalar s); @@ -222,19 +239,31 @@ class CV_EXPORTS_W GpuMat CV_WRAP GpuMat& setTo(Scalar s, InputArray mask, Stream& stream); //! converts GpuMat to another datatype (Blocking call) - CV_WRAP void convertTo(OutputArray dst, int rtype) const; + void convertTo(OutputArray dst, int rtype) const; //! converts GpuMat to another datatype (Non-Blocking call) - CV_WRAP void convertTo(OutputArray dst, int rtype, Stream& stream) const; + void convertTo(OutputArray dst, int rtype, Stream& stream) const; + //! bindings overload which converts GpuMat to another datatype (Non-Blocking call) + CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, Stream& stream) const { + convertTo(static_cast(dst), rtype, stream); + } //! converts GpuMat to another datatype with scaling (Blocking call) - CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const; + void convertTo(OutputArray dst, int rtype, double alpha, double beta = 0.0) const; + //! bindings overload which converts GpuMat to another datatype with scaling(Blocking call) + CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha = 1.0, double beta = 0.0) const { + convertTo(static_cast(dst), rtype, alpha, beta); + } //! converts GpuMat to another datatype with scaling (Non-Blocking call) - CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const; + void convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const; //! converts GpuMat to another datatype with scaling (Non-Blocking call) - CV_WRAP void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const; + void convertTo(OutputArray dst, int rtype, double alpha, double beta, Stream& stream) const; + //! 
bindings overload which converts GpuMat to another datatype with scaling (Non-Blocking call) + CV_WRAP void convertTo(CV_OUT GpuMat& dst, int rtype, double alpha, double beta, Stream& stream) const { + convertTo(static_cast(dst), rtype, alpha, beta, stream); + } CV_WRAP void assignTo(GpuMat& m, int type = -1) const; @@ -384,8 +413,9 @@ class CV_EXPORTS_W GpuMatND data, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it. - @param step Array of _size.size()-1 steps in case of a multi-dimensional array (the last step is always - set to the element size). If not specified, the matrix is assumed to be continuous. + @param step Array of _size.size() or _size.size()-1 steps in case of a multi-dimensional array + (if specified, the last step must be equal to the element size, otherwise it will be added as such). + If not specified, the matrix is assumed to be continuous. */ GpuMatND(SizeArray size, int type, void* data, StepArray step = StepArray()); @@ -567,6 +597,29 @@ The function does not reallocate memory if the matrix has proper attributes alre */ CV_EXPORTS_W void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr); +/** @brief Bindings overload to create a GpuMat from existing GPU memory. +@param rows Row count. +@param cols Column count. +@param type Type of the matrix. +@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it. +@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize. +@note Overload for generation of bindings only, not exported or intended for use internally from C++. + */ +CV_EXPORTS_W GpuMat inline createGpuMatFromCudaMemory(int rows, int cols, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) { + return GpuMat(rows, cols, type, reinterpret_cast(cudaMemoryAddress), step); +} + + /** @overload +@param size 2D array size: Size(cols, rows). In the Size() constructor, the number of rows and the number of columns go in the reverse order. +@param type Type of the matrix. +@param cudaMemoryAddress Address of the allocated GPU memory on the device. This does not allocate matrix data. Instead, it just initializes the matrix header that points to the specified \a cudaMemoryAddress, which means that no data is copied. This operation is very efficient and can be used to process external data using OpenCV functions. The external data is not automatically deallocated, so you should take care of it. +@param step Number of bytes each matrix row occupies. The value should include the padding bytes at the end of each row, if any. If the parameter is missing (set to Mat::AUTO_STEP ), no padding is assumed and the actual step is calculated as cols*elemSize(). See GpuMat::elemSize. +@note Overload for generation of bindings only, not exported or intended for use internally from C++. 
+ */ +CV_EXPORTS_W inline GpuMat createGpuMatFromCudaMemory(Size size, int type, size_t cudaMemoryAddress, size_t step = Mat::AUTO_STEP) { + return GpuMat(size, type, reinterpret_cast(cudaMemoryAddress), step); +} + /** @brief BufferPool for use with CUDA streams BufferPool utilizes Stream's allocator to create new buffers for GpuMat's. It is @@ -609,8 +662,8 @@ Below is an example that utilizes BufferPool with StackAllocator: GpuMat d_src2 = pool2.getBuffer(1024, 1024, CV_8UC1); // 1MB GpuMat d_dst2 = pool2.getBuffer(1024, 1024, CV_8UC3); // 3MB - cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1); - cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2); + cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1); + cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2); } @endcode @@ -675,8 +728,8 @@ and the corresponding memory is automatically returned to the pool for later usa d_src1.setTo(Scalar(i), stream1); d_src2.setTo(Scalar(i), stream2); - cvtColor(d_src1, d_dst1, CV_GRAY2BGR, 0, stream1); - cvtColor(d_src2, d_dst2, CV_GRAY2BGR, 0, stream2); + cvtColor(d_src1, d_dst1, cv::COLOR_GRAY2BGR, 0, stream1); + cvtColor(d_src2, d_dst2, cv::COLOR_GRAY2BGR, 0, stream2); // The order of destruction of the local variables is: // d_dst2 => d_src2 => d_dst1 => d_src1 // LIFO rule is satisfied, this code runs without error @@ -694,8 +747,16 @@ class CV_EXPORTS_W BufferPool //! Allocates a new GpuMat of given size and type. CV_WRAP GpuMat getBuffer(int rows, int cols, int type); +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif //! Allocates a new GpuMat of given size and type. CV_WRAP GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif //! Returns the allocator associated with the stream. CV_WRAP Ptr getAllocator() const { return allocator_; } @@ -913,6 +974,13 @@ class CV_EXPORTS_W Stream friend class DefaultDeviceInitializer; }; + +/** @brief Bindings overload to create a Stream object from the address stored in an existing CUDA Runtime API stream pointer (cudaStream_t). +@param cudaStreamMemoryAddress Memory address stored in a CUDA Runtime API stream pointer (cudaStream_t). The created Stream object does not perform any allocation or deallocation and simply wraps existing raw CUDA Runtime API stream pointer. +@note Overload for generation of bindings only, not exported or intended for use internally from C++. 
+ */ +CV_EXPORTS_W Stream wrapStream(size_t cudaStreamMemoryAddress); + class CV_EXPORTS_W Event { public: diff --git a/3rdParty/opencv2/core/cuda.inl.hpp b/3rdParty/opencv2/core/cuda.inl.hpp index 01d7ec0c84..9eae299806 100644 --- a/3rdParty/opencv2/core/cuda.inl.hpp +++ b/3rdParty/opencv2/core/cuda.inl.hpp @@ -75,6 +75,11 @@ GpuMat::GpuMat(Size size_, int type_, Allocator* allocator_) create(size_.height, size_.width, type_); } +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_, Allocator* allocator_) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_) @@ -96,6 +101,9 @@ GpuMat::GpuMat(Size size_, int type_, Scalar s_, Allocator* allocator_) setTo(s_); } } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif inline GpuMat::GpuMat(const GpuMat& m) @@ -158,11 +166,19 @@ GpuMat GpuMat::clone() const return m; } +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline void GpuMat::copyTo(OutputArray dst, InputArray mask) const { copyTo(dst, mask, Stream::Null()); } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif inline GpuMat& GpuMat::setTo(Scalar s) @@ -176,6 +192,11 @@ GpuMat& GpuMat::setTo(Scalar s, InputArray mask) return setTo(s, mask, Stream::Null()); } +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline void GpuMat::convertTo(OutputArray dst, int rtype) const { @@ -187,6 +208,9 @@ void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) co { convertTo(dst, rtype, alpha, beta, Stream::Null()); } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif inline void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const @@ -646,6 +670,11 @@ Event::Event(const Ptr& impl) // Initialization & Info //=================================================================================== +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline bool TargetArchs::has(int major, int minor) { @@ -663,6 +692,9 @@ DeviceInfo::DeviceInfo() { device_id_ = getDevice(); } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif inline DeviceInfo::DeviceInfo(int device_id) @@ -671,6 +703,11 @@ DeviceInfo::DeviceInfo(int device_id) device_id_ = device_id; } +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline int DeviceInfo::deviceID() const { @@ -699,6 +736,9 @@ bool DeviceInfo::supports(FeatureSet feature_set) const int version = majorVersion() * 10 + minorVersion(); return version >= feature_set; } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif }} // namespace cv { namespace cuda { diff --git a/3rdParty/opencv2/core/cuda/common.hpp b/3rdParty/opencv2/core/cuda/common.hpp index 7acb219555..58cf9d8579 100644 --- a/3rdParty/opencv2/core/cuda/common.hpp +++ b/3rdParty/opencv2/core/cuda/common.hpp @@ -65,8 +65,10 @@ namespace cv { namespace cuda { static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func) { - if 
(cudaSuccess != err) + if (cudaSuccess != err) { + cudaGetLastError(); // reset the last stored error to cudaSuccess cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line); + } } }} @@ -96,6 +98,11 @@ namespace cv { namespace cuda return (total + grain - 1) / grain; } +#if (CUDART_VERSION >= 12000) + template inline void createTextureObjectPitch2D(cudaTextureObject_t*, PtrStepSz&, const cudaTextureDesc&) { + CV_Error(cv::Error::GpuNotSupported, "Function removed in CUDA SDK 12"); } +#else + //TODO: remove from OpenCV 5.x template inline void bindTexture(const textureReference* tex, const PtrStepSz& img) { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); @@ -115,6 +122,7 @@ namespace cv { namespace cuda cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) ); } +#endif } }} diff --git a/3rdParty/opencv2/core/cuda/detail/reduce.hpp b/3rdParty/opencv2/core/cuda/detail/reduce.hpp index 1bcb81ee76..507e6b334d 100644 --- a/3rdParty/opencv2/core/cuda/detail/reduce.hpp +++ b/3rdParty/opencv2/core/cuda/detail/reduce.hpp @@ -134,6 +134,22 @@ namespace cv { namespace cuda { namespace device { val = smem[tid]; } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 template __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, @@ -142,6 +158,7 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); } + template __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, @@ -151,18 +168,6 @@ namespace cv { namespace cuda { namespace device For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); } - template - __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) - { - T reg = smem[tid + delta]; - smem[tid] = val = op(val, reg); - } - template - __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) - { - T reg = shfl_down(val, delta, width); - val = op(val, reg); - } template @@ -183,7 +188,31 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); } +#else + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid, unsigned int delta, const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, unsigned int delta, unsigned int width, const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); + } +#endif template struct 
Generic { template diff --git a/3rdParty/opencv2/core/cuda/detail/reduce_key_val.hpp b/3rdParty/opencv2/core/cuda/detail/reduce_key_val.hpp index f90fbcd557..535f9d8ed0 100644 --- a/3rdParty/opencv2/core/cuda/detail/reduce_key_val.hpp +++ b/3rdParty/opencv2/core/cuda/detail/reduce_key_val.hpp @@ -177,6 +177,8 @@ namespace cv { namespace cuda { namespace device { data = smem[tid]; } + +#if (CUDART_VERSION < 12040) template __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, @@ -193,9 +195,18 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); } - - ////////////////////////////////////////////////////// - // copyVals +#else + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, const thrust::tuple& data, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, const thrust::tuple& data, unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } +#endif template __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) @@ -207,24 +218,6 @@ namespace cv { namespace cuda { namespace device { svals[tid] = val = svals[tid + delta]; } - template - __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, - unsigned int delta, - int width) - { - For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); - } - template - __device__ __forceinline__ void copyVals(const thrust::tuple& svals, - const thrust::tuple& val, - unsigned int tid, unsigned int delta) - { - For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); - } - - ////////////////////////////////////////////////////// - // merge template __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) @@ -248,6 +241,24 @@ namespace cv { namespace cuda { namespace device copyVals(svals, val, tid, delta); } } + +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + template @@ -305,7 +316,61 @@ namespace cv { namespace cuda { namespace device { For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); } +#else + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + template + __device__ __forceinline__ void mergeShfl(K& key, const thrust::tuple& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, const thrust::tuple& svals, + const 
thrust::tuple& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + } +#endif ////////////////////////////////////////////////////// // Generic diff --git a/3rdParty/opencv2/core/cuda/reduce.hpp b/3rdParty/opencv2/core/cuda/reduce.hpp index f164ef5bdd..f422b0d58d 100644 --- a/3rdParty/opencv2/core/cuda/reduce.hpp +++ b/3rdParty/opencv2/core/cuda/reduce.hpp @@ -64,6 +64,12 @@ namespace cv { namespace cuda { namespace device { reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } +#if (CUDART_VERSION < 12040) // details: https://github.com/opencv/opencv_contrib/issues/3690 template &>(smem, val, tid, op); } - template - __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) - { - reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); - } template &, const Cmp&>(skeys, key, svals, val, tid, cmp); } + template & >(skeys, key, svals, val, tid, cmp); } +#else + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, const thrust::tuple& val, unsigned int tid, const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const Cmp&>(skeys, key, svals, val, tid, cmp); + } + + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, const thrust::tuple& key, const thrust::tuple& svals, const thrust::tuple& val, unsigned int tid, const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, const thrust::tuple&, const thrust::tuple&, const thrust::tuple&, const thrust::tuple&>(skeys, key, svals, val, tid, cmp); + } +#endif // smem_tuple diff --git a/3rdParty/opencv2/core/cuda_types.hpp b/3rdParty/opencv2/core/cuda_types.hpp index 269befec9f..73131b1d32 100644 --- a/3rdParty/opencv2/core/cuda_types.hpp +++ b/3rdParty/opencv2/core/cuda_types.hpp @@ -66,6 +66,9 @@ #define __CV_CUDA_HOST_DEVICE__ #endif +#include "opencv2/core/cvdef.h" +#include "opencv2/core.hpp" + namespace cv { namespace cuda @@ -124,6 +127,11 @@ namespace cv int cols; int rows; + + CV_NODISCARD_STD __CV_CUDA_HOST_DEVICE__ Size size() const { return {cols, rows}; } + CV_NODISCARD_STD __CV_CUDA_HOST_DEVICE__ T& 
operator ()(const Point &pos) { return (*this)(pos.y, pos.x); } + CV_NODISCARD_STD __CV_CUDA_HOST_DEVICE__ const T& operator ()(const Point &pos) const { return (*this)(pos.y, pos.x); } + using PtrStep::operator(); }; typedef PtrStepSz PtrStepSzb; diff --git a/3rdParty/opencv2/core/cv_cpu_dispatch.h b/3rdParty/opencv2/core/cv_cpu_dispatch.h index 9cfd787318..fb4f4a133b 100644 --- a/3rdParty/opencv2/core/cv_cpu_dispatch.h +++ b/3rdParty/opencv2/core/cv_cpu_dispatch.h @@ -79,6 +79,10 @@ # endif # define CV_FP16 1 #endif +#ifdef CV_CPU_COMPILE_NEON_DOTPROD +# include +# define CV_NEON_DOT 1 +#endif #ifdef CV_CPU_COMPILE_AVX2 # include # define CV_AVX2 1 @@ -137,18 +141,28 @@ # include # include # define CV_NEON 1 -#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__)) +#elif defined(__ARM_NEON) # include # define CV_NEON 1 #endif -#if defined(__riscv) && defined(__riscv_vector) && defined(__riscv_vector_071) -# include -# define CV_RVV071 1 -#endif - -#if defined(__ARM_NEON__) || defined(__aarch64__) -# include +/* RVV-related macro states with different compiler +// +--------------------+----------+----------+ +// | Macro | Upstream | XuanTie | +// +--------------------+----------+----------+ +// | CV_CPU_COMPILE_RVV | defined | defined | +// | CV_RVV | 1 | 0 | +// | CV_RVV071 | 0 | 1 | +// | CV_TRY_RVV | 1 | 1 | +// +--------------------+----------+----------+ +*/ +#ifdef CV_CPU_COMPILE_RVV +# ifdef __riscv_vector_071 +# define CV_RVV071 1 +# else +# define CV_RVV 1 +# endif +#include #endif #ifdef CV_CPU_COMPILE_VSX @@ -168,16 +182,21 @@ # define CV_MSA 1 #endif +#ifdef CV_CPU_COMPILE_LSX +# include +# define CV_LSX 1 +#endif + +#ifdef CV_CPU_COMPILE_LASX +# include +# define CV_LASX 1 +#endif + #ifdef __EMSCRIPTEN__ # define CV_WASM_SIMD 1 # include #endif -#if defined CV_CPU_COMPILE_RVV -# define CV_RVV 1 -# include -#endif - #endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__ #if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX @@ -215,7 +234,7 @@ struct VZeroUpperGuard { # include # include # define CV_NEON 1 -#elif defined(__ARM_NEON__) || (defined (__ARM_NEON) && defined(__aarch64__)) +#elif defined(__ARM_NEON) # include # define CV_NEON 1 #elif defined(__VSX__) && defined(__PPC64__) && defined(__LITTLE_ENDIAN__) @@ -366,3 +385,11 @@ struct VZeroUpperGuard { #ifndef CV_RVV # define CV_RVV 0 #endif + +#ifndef CV_LSX +# define CV_LSX 0 +#endif + +#ifndef CV_LASX +# define CV_LASX 0 +#endif diff --git a/3rdParty/opencv2/core/cv_cpu_helper.h b/3rdParty/opencv2/core/cv_cpu_helper.h index 5c54f0b348..29952aec36 100644 --- a/3rdParty/opencv2/core/cv_cpu_helper.h +++ b/3rdParty/opencv2/core/cv_cpu_helper.h @@ -420,6 +420,69 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) 
CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD +# define CV_TRY_NEON_DOTPROD 1 +# define CV_CPU_FORCE_NEON_DOTPROD 1 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1 +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD +# define CV_TRY_NEON_DOTPROD 1 +# define CV_CPU_FORCE_NEON_DOTPROD 0 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD)) +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args) +#else +# define CV_TRY_NEON_DOTPROD 0 +# define CV_CPU_FORCE_NEON_DOTPROD 0 +# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0 +# define CV_CPU_CALL_NEON_DOTPROD(fn, args) +# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...) CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_FP16 +# define CV_TRY_NEON_FP16 1 +# define CV_CPU_FORCE_NEON_FP16 1 +# define CV_CPU_HAS_SUPPORT_NEON_FP16 1 +# define CV_CPU_CALL_NEON_FP16(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_NEON_FP16_(fn, args) return (opt_NEON_FP16::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_FP16 +# define CV_TRY_NEON_FP16 1 +# define CV_CPU_FORCE_NEON_FP16 0 +# define CV_CPU_HAS_SUPPORT_NEON_FP16 (cv::checkHardwareSupport(CV_CPU_NEON_FP16)) +# define CV_CPU_CALL_NEON_FP16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args) +# define CV_CPU_CALL_NEON_FP16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_FP16) return (opt_NEON_FP16::fn args) +#else +# define CV_TRY_NEON_FP16 0 +# define CV_CPU_FORCE_NEON_FP16 0 +# define CV_CPU_HAS_SUPPORT_NEON_FP16 0 +# define CV_CPU_CALL_NEON_FP16(fn, args) +# define CV_CPU_CALL_NEON_FP16_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_NEON_FP16(fn, args, mode, ...) 
CV_CPU_CALL_NEON_FP16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_BF16 +# define CV_TRY_NEON_BF16 1 +# define CV_CPU_FORCE_NEON_BF16 1 +# define CV_CPU_HAS_SUPPORT_NEON_BF16 1 +# define CV_CPU_CALL_NEON_BF16(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_NEON_BF16_(fn, args) return (opt_NEON_BF16::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_BF16 +# define CV_TRY_NEON_BF16 1 +# define CV_CPU_FORCE_NEON_BF16 0 +# define CV_CPU_HAS_SUPPORT_NEON_BF16 (cv::checkHardwareSupport(CV_CPU_NEON_BF16)) +# define CV_CPU_CALL_NEON_BF16(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args) +# define CV_CPU_CALL_NEON_BF16_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_BF16) return (opt_NEON_BF16::fn args) +#else +# define CV_TRY_NEON_BF16 0 +# define CV_CPU_FORCE_NEON_BF16 0 +# define CV_CPU_HAS_SUPPORT_NEON_BF16 0 +# define CV_CPU_CALL_NEON_BF16(fn, args) +# define CV_CPU_CALL_NEON_BF16_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_NEON_BF16(fn, args, mode, ...) CV_CPU_CALL_NEON_BF16(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA # define CV_TRY_MSA 1 # define CV_CPU_FORCE_MSA 1 @@ -504,5 +567,47 @@ #endif #define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LSX +# define CV_TRY_LSX 1 +# define CV_CPU_FORCE_LSX 1 +# define CV_CPU_HAS_SUPPORT_LSX 1 +# define CV_CPU_CALL_LSX(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_LSX_(fn, args) return (opt_LSX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LSX +# define CV_TRY_LSX 1 +# define CV_CPU_FORCE_LSX 0 +# define CV_CPU_HAS_SUPPORT_LSX (cv::checkHardwareSupport(CV_CPU_LSX)) +# define CV_CPU_CALL_LSX(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args) +# define CV_CPU_CALL_LSX_(fn, args) if (CV_CPU_HAS_SUPPORT_LSX) return (opt_LSX::fn args) +#else +# define CV_TRY_LSX 0 +# define CV_CPU_FORCE_LSX 0 +# define CV_CPU_HAS_SUPPORT_LSX 0 +# define CV_CPU_CALL_LSX(fn, args) +# define CV_CPU_CALL_LSX_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_LSX(fn, args, mode, ...) 
CV_CPU_CALL_LSX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + +#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 1 +# define CV_CPU_HAS_SUPPORT_LASX 1 +# define CV_CPU_CALL_LASX(fn, args) return (cpu_baseline::fn args) +# define CV_CPU_CALL_LASX_(fn, args) return (opt_LASX::fn args) +#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_LASX +# define CV_TRY_LASX 1 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX (cv::checkHardwareSupport(CV_CPU_LASX)) +# define CV_CPU_CALL_LASX(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +# define CV_CPU_CALL_LASX_(fn, args) if (CV_CPU_HAS_SUPPORT_LASX) return (opt_LASX::fn args) +#else +# define CV_TRY_LASX 0 +# define CV_CPU_FORCE_LASX 0 +# define CV_CPU_HAS_SUPPORT_LASX 0 +# define CV_CPU_CALL_LASX(fn, args) +# define CV_CPU_CALL_LASX_(fn, args) +#endif +#define __CV_CPU_DISPATCH_CHAIN_LASX(fn, args, mode, ...) CV_CPU_CALL_LASX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__)) + #define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args) #define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args) /* last in sequence */ diff --git a/3rdParty/opencv2/core/cvdef.h b/3rdParty/opencv2/core/cvdef.h index 51274637b6..002a60f4d2 100644 --- a/3rdParty/opencv2/core/cvdef.h +++ b/3rdParty/opencv2/core/cvdef.h @@ -201,6 +201,14 @@ namespace cv { # define CV_ICC __INTEL_COMPILER #endif +#if defined _WIN32 +# define CV_CDECL __cdecl +# define CV_STDCALL __stdcall +#else +# define CV_CDECL +# define CV_STDCALL +#endif + #ifndef CV_INLINE # if defined __cplusplus # define CV_INLINE static inline @@ -268,6 +276,9 @@ namespace cv { #define CV_CPU_AVX_5124FMAPS 27 #define CV_CPU_NEON 100 +#define CV_CPU_NEON_DOTPROD 101 +#define CV_CPU_NEON_FP16 102 +#define CV_CPU_NEON_BF16 103 #define CV_CPU_MSA 150 @@ -278,6 +289,9 @@ namespace cv { #define CV_CPU_RVV 210 +#define CV_CPU_LSX 230 +#define CV_CPU_LASX 231 + // CPU features groups #define CV_CPU_AVX512_SKX 256 #define CV_CPU_AVX512_COMMON 257 @@ -324,6 +338,9 @@ enum CpuFeatures { CPU_AVX_5124FMAPS = 27, CPU_NEON = 100, + CPU_NEON_DOTPROD = 101, + CPU_NEON_FP16 = 102, + CPU_NEON_BF16 = 103, CPU_MSA = 150, @@ -334,6 +351,9 @@ enum CpuFeatures { CPU_RVV = 210, + CPU_LSX = 230, + CPU_LASX = 231, + CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512 CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF @@ -453,20 +473,25 @@ Cv64suf; #define CV_EXPORTS_W_SIMPLE CV_EXPORTS #define CV_EXPORTS_AS(synonym) CV_EXPORTS #define CV_EXPORTS_W_MAP CV_EXPORTS +#define CV_EXPORTS_W_PARAMS CV_EXPORTS #define CV_IN_OUT #define CV_OUT #define CV_PROP #define CV_PROP_RW +#define CV_ND // Indicates that input data should be parsed into Mat without channels #define CV_WRAP #define CV_WRAP_AS(synonym) #define CV_WRAP_MAPPABLE(mappable) #define CV_WRAP_PHANTOM(phantom_header) #define CV_WRAP_DEFAULT(val) +/* Indicates that the function parameter has filesystem path semantic */ +#define CV_WRAP_FILE_PATH /****************************************************************************************\ * Matrix type (Mat) * \****************************************************************************************/ +#define CV_MAX_DIM 32 #define 
CV_MAT_CN_MASK ((CV_CN_MAX - 1) << CV_CN_SHIFT) #define CV_MAT_CN(flags) ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1) #define CV_MAT_TYPE_MASK (CV_DEPTH_MAX*CV_CN_MAX - 1) @@ -493,6 +518,13 @@ Cv64suf; # define MAX(a,b) ((a) < (b) ? (b) : (a)) #endif +/** min & max without jumps */ +#define CV_IMIN(a, b) ((a) ^ (((a)^(b)) & (((a) < (b)) - 1))) +#define CV_IMAX(a, b) ((a) ^ (((a)^(b)) & (((a) > (b)) - 1))) +#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t)) +#define CV_CMP(a,b) (((a) > (b)) - ((a) < (b))) +#define CV_SIGN(a) CV_CMP((a),0) + ///////////////////////////////////////// Enum operators /////////////////////////////////////// /** @@ -715,7 +747,11 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType) # define __has_cpp_attribute(__x) 0 # endif # if __has_cpp_attribute(nodiscard) -# define CV_NODISCARD_STD [[nodiscard]] +# if defined(__NVCC__) && __CUDACC_VER_MAJOR__ < 12 +# define CV_NODISCARD_STD +# else +# define CV_NODISCARD_STD [[nodiscard]] +# endif # elif __cplusplus >= 201703L // available when compiler is C++17 compliant # define CV_NODISCARD_STD [[nodiscard]] @@ -738,88 +774,43 @@ __CV_ENUM_FLAGS_BITWISE_XOR_EQ (EnumType, EnumType) /****************************************************************************************\ -* CV_NODISCARD attribute (deprecated, GCC only) * -* DONT USE: use instead the standard CV_NODISCARD_STD macro above * -* this legacy method silently fails to issue warning until some version * -* after gcc 6.3.0. Yet with gcc 7+ you can use the above standard method * -* which makes this method useless. Don't use it. * -* @deprecated use instead CV_NODISCARD_STD * +* C++ 11 * \****************************************************************************************/ -#ifndef CV_NODISCARD -# if defined(__GNUC__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) -# elif defined(__clang__) && defined(__has_attribute) -# if __has_attribute(__warn_unused_result__) -# define CV_NODISCARD __attribute__((__warn_unused_result__)) +#ifdef __cplusplus +// MSVC was stuck at __cplusplus == 199711L for a long time, even where it supports C++11, +// so check _MSC_VER instead. 
See: +// +# if defined(_MSC_VER) +# if _MSC_VER < 1800 +# error "OpenCV 4.x+ requires enabled C++11 support" # endif +# elif __cplusplus < 201103L +# error "OpenCV 4.x+ requires enabled C++11 support" # endif #endif -#ifndef CV_NODISCARD -# define CV_NODISCARD /* nothing by default */ -#endif - -/****************************************************************************************\ -* C++ 11 * -\****************************************************************************************/ -#ifndef CV_CXX11 -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1800) -# define CV_CXX11 1 -# endif -#else -# if CV_CXX11 == 0 -# undef CV_CXX11 -# endif -#endif #ifndef CV_CXX11 -# error "OpenCV 4.x+ requires enabled C++11 support" +# define CV_CXX11 1 #endif -#define CV_CXX_MOVE_SEMANTICS 1 -#define CV_CXX_MOVE(x) std::move(x) -#define CV_CXX_STD_ARRAY 1 -#include #ifndef CV_OVERRIDE # define CV_OVERRIDE override #endif + #ifndef CV_FINAL # define CV_FINAL final #endif #ifndef CV_NOEXCEPT -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_NOEXCEPT noexcept -# endif -#endif -#ifndef CV_NOEXCEPT -# define CV_NOEXCEPT +# define CV_NOEXCEPT noexcept #endif #ifndef CV_CONSTEXPR -# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/) -# define CV_CONSTEXPR constexpr -# endif -#endif -#ifndef CV_CONSTEXPR -# define CV_CONSTEXPR +# define CV_CONSTEXPR constexpr #endif -// Integer types portatibility -#ifdef OPENCV_STDINT_HEADER -#include OPENCV_STDINT_HEADER -#elif defined(__cplusplus) -#if defined(_MSC_VER) && _MSC_VER < 1600 /* MSVS 2010 */ -namespace cv { -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int16_t; -typedef unsigned short uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -} -#elif defined(_MSC_VER) || __cplusplus >= 201103L +// Integer types portability +#ifdef __cplusplus #include namespace cv { using std::int8_t; @@ -831,19 +822,6 @@ using std::uint32_t; using std::int64_t; using std::uint64_t; } -#else -#include -namespace cv { -typedef ::int8_t int8_t; -typedef ::uint8_t uint8_t; -typedef ::int16_t int16_t; -typedef ::uint16_t uint16_t; -typedef ::int32_t int32_t; -typedef ::uint32_t uint32_t; -typedef ::int64_t int64_t; -typedef ::uint64_t uint64_t; -} -#endif #else // pure C #include #endif @@ -852,42 +830,22 @@ typedef ::uint64_t uint64_t; namespace cv { -class float16_t +class hfloat { public: #if CV_FP16_TYPE - float16_t() : h(0) {} - explicit float16_t(float x) { h = (__fp16)x; } + hfloat() : h(0) {} + explicit hfloat(float x) { h = (__fp16)x; } operator float() const { return (float)h; } - static float16_t fromBits(ushort w) - { - Cv16suf u; - u.u = w; - float16_t result; - result.h = u.h; - return result; - } - static float16_t zero() - { - float16_t result; - result.h = (__fp16)0; - return result; - } - ushort bits() const - { - Cv16suf u; - u.h = h; - return u.u; - } protected: __fp16 h; #else - float16_t() : w(0) {} - explicit float16_t(float x) + hfloat() : w(0) {} + explicit hfloat(float x) { - #if CV_FP16 + #if CV_FP16 && CV_AVX2 __m128 v = _mm_load_ss(&x); w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0)); #else @@ -918,7 +876,7 @@ class float16_t operator float() const { - #if CV_FP16 + #if CV_FP16 && CV_AVX2 float f; _mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w))); return f; @@ -936,28 +894,51 @@ class float16_t #endif } - static float16_t fromBits(ushort 
b) - { - float16_t result; - result.w = b; - return result; - } - static float16_t zero() - { - float16_t result; - result.w = (ushort)0; - return result; - } - ushort bits() const { return w; } protected: ushort w; #endif }; +inline hfloat hfloatFromBits(ushort w) { +#if CV_FP16_TYPE + Cv16suf u; + u.u = w; + hfloat res(float(u.h)); + return res; +#else + Cv32suf out; + + unsigned t = ((w & 0x7fff) << 13) + 0x38000000; + unsigned sign = (w & 0x8000) << 16; + unsigned e = w & 0x7c00; + + out.u = t + (1 << 23); + out.u = (e >= 0x7c00 ? t + 0x38000000 : + e == 0 ? (static_cast(out.f -= 6.103515625e-05f), out.u) : t) | sign; + hfloat res(out.f); + return res; +#endif } + +#if !defined(__OPENCV_BUILD) && !(defined __STDCPP_FLOAT16_T__) && !(defined __ARM_NEON) +typedef hfloat float16_t; #endif +} +#endif + +/** @brief Constructs the 'fourcc' code, used in video codecs and many other places. + Simply call it with 4 chars like `CV_FOURCC('I', 'Y', 'U', 'V')` +*/ +CV_INLINE int CV_FOURCC(char c1, char c2, char c3, char c4) +{ + return (c1 & 255) + ((c2 & 255) << 8) + ((c3 & 255) << 16) + ((c4 & 255) << 24); +} + +//! Macro to construct the fourcc code of the codec. Same as CV_FOURCC() +#define CV_FOURCC_MACRO(c1, c2, c3, c4) (((c1) & 255) + (((c2) & 255) << 8) + (((c3) & 255) << 16) + (((c4) & 255) << 24)) + //! @} #ifndef __cplusplus diff --git a/3rdParty/opencv2/core/cvstd.hpp b/3rdParty/opencv2/core/cvstd.hpp index 70fb97d8dd..1946f2521f 100644 --- a/3rdParty/opencv2/core/cvstd.hpp +++ b/3rdParty/opencv2/core/cvstd.hpp @@ -140,7 +140,6 @@ template class Allocator //! @} core_utils -//! @endcond //! @addtogroup core_basic //! @{ diff --git a/3rdParty/opencv2/core/detail/async_promise.hpp b/3rdParty/opencv2/core/detail/async_promise.hpp index 113e494e94..811a24324f 100644 --- a/3rdParty/opencv2/core/detail/async_promise.hpp +++ b/3rdParty/opencv2/core/detail/async_promise.hpp @@ -52,10 +52,8 @@ class CV_EXPORTS AsyncPromise */ void setException(const cv::Exception& exception); -#ifdef CV_CXX11 explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; } AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; } -#endif // PImpl diff --git a/3rdParty/opencv2/core/detail/exception_ptr.hpp b/3rdParty/opencv2/core/detail/exception_ptr.hpp index d5996bb066..ed5c40f24f 100644 --- a/3rdParty/opencv2/core/detail/exception_ptr.hpp +++ b/3rdParty/opencv2/core/detail/exception_ptr.hpp @@ -8,14 +8,8 @@ #ifndef CV__EXCEPTION_PTR # if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2 # define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938 -# elif defined(CV_CXX11) +# else # define CV__EXCEPTION_PTR 1 -# elif defined(_MSC_VER) -# define CV__EXCEPTION_PTR (_MSC_VER >= 1600) -# elif defined(__clang__) -# define CV__EXCEPTION_PTR 0 // C++11 only (see above) -# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__) -# define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0) # endif #endif #ifndef CV__EXCEPTION_PTR diff --git a/3rdParty/opencv2/core/dualquaternion.hpp b/3rdParty/opencv2/core/dualquaternion.hpp index 788403179c..990ca101c6 100644 --- a/3rdParty/opencv2/core/dualquaternion.hpp +++ b/3rdParty/opencv2/core/dualquaternion.hpp @@ -30,7 +30,7 @@ #include namespace cv{ -//! @addtogroup core +//! @addtogroup core_quaternion //! 
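A quick usage note on the CV_FOURCC helper added above: it packs four character codes into a single 32-bit tag, lowest byte first, which is the form codec identifiers take elsewhere in OpenCV. The sketch below only mirrors the packing rule visible in the patch; the local fourcc() helper and the checked constant are illustrative, not part of the header.

#include <cassert>

// Mirrors the packing rule of CV_FOURCC shown above: c1 is the lowest byte, c4 the highest.
static int fourcc(char c1, char c2, char c3, char c4)
{
    return (c1 & 255) + ((c2 & 255) << 8) + ((c3 & 255) << 16) + ((c4 & 255) << 24);
}

int main()
{
    // 'M' = 0x4D, 'J' = 0x4A, 'P' = 0x50, 'G' = 0x47 -> little-endian packed tag
    assert(fourcc('M', 'J', 'P', 'G') == 0x47504A4D);
    return 0;
}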
@{ template class DualQuat; diff --git a/3rdParty/opencv2/core/dualquaternion.inl.hpp b/3rdParty/opencv2/core/dualquaternion.inl.hpp index 22cc67c12b..6d3d1f0f2e 100644 --- a/3rdParty/opencv2/core/dualquaternion.inl.hpp +++ b/3rdParty/opencv2/core/dualquaternion.inl.hpp @@ -36,15 +36,15 @@ namespace cv { template -DualQuat::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){}; +DualQuat::DualQuat():w(0), x(0), y(0), z(0), w_(0), x_(0), y_(0), z_(0){} template DualQuat::DualQuat(const T vw, const T vx, const T vy, const T vz, const T _w, const T _x, const T _y, const T _z): - w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){}; + w(vw), x(vx), y(vy), z(vz), w_(_w), x_(_x), y_(_y), z_(_z){} template DualQuat::DualQuat(const Vec &q):w(q[0]), x(q[1]), y(q[2]), z(q[3]), - w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){}; + w_(q[4]), x_(q[5]), y_(q[6]), z_(q[7]){} template DualQuat DualQuat::createFromQuat(const Quat &realPart, const Quat &dualPart) diff --git a/3rdParty/opencv2/core/eigen.hpp b/3rdParty/opencv2/core/eigen.hpp index 5e87988c54..ad3c84225f 100644 --- a/3rdParty/opencv2/core/eigen.hpp +++ b/3rdParty/opencv2/core/eigen.hpp @@ -52,15 +52,16 @@ #include "opencv2/core.hpp" #if defined _MSC_VER && _MSC_VER >= 1200 +#ifndef NOMINMAX #define NOMINMAX // fix https://github.com/opencv/opencv/issues/17548 +#endif #pragma warning( disable: 4714 ) //__forceinline is not inlined #pragma warning( disable: 4127 ) //conditional expression is constant #pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data #endif #if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT) -#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \ - && defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY) +#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 #include #define OPENCV_EIGEN_TENSOR_SUPPORT 1 #endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 @@ -286,6 +287,17 @@ void cv2eigen( const Mat& src, } } +template static inline +void cv2eigen( const Mat& src, + Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>& dst ) +{ + CV_CheckEQ(src.dims, 2, ""); + dst.resize(src.rows, src.cols); + const Mat _dst(src.rows, src.cols, traits::Type<_Tp>::value, + dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); + src.convertTo(_dst, _dst.type()); +} + // Matx case template static inline void cv2eigen( const Matx<_Tp, _rows, _cols>& src, @@ -306,6 +318,17 @@ void cv2eigen( const Matx<_Tp, _rows, _cols>& src, } } +template static inline +void cv2eigen( const Matx<_Tp, _rows, _cols>& src, + Eigen::Matrix<_Tp, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>& dst ) +{ + CV_CheckEQ(src.dims, 2, ""); + dst.resize(_rows, _cols); + const Mat _dst(_rows, _cols, traits::Type<_Tp>::value, + dst.data(), (size_t)(dst.outerStride()*sizeof(_Tp))); + Mat(src).copyTo(_dst); +} + template static inline void cv2eigen( const Mat& src, Eigen::Matrix<_Tp, Eigen::Dynamic, 1>& dst ) diff --git a/3rdParty/opencv2/core/fast_math.hpp b/3rdParty/opencv2/core/fast_math.hpp index 9f0f1f9b9b..3a6f8163fe 100644 --- a/3rdParty/opencv2/core/fast_math.hpp +++ b/3rdParty/opencv2/core/fast_math.hpp @@ -68,7 +68,7 @@ // nothing, intrinsics/asm code is not supported #else #if ((defined _MSC_VER && defined _M_X64) \ - || (defined __GNUC__ && defined __x86_64__ && defined __SSE2__)) \ + || (defined __GNUC__ && defined __SSE2__)) \ && !defined(OPENCV_SKIP_INCLUDE_EMMINTRIN_H) #include #endif @@ -84,7 +84,7 @@ #if defined(CV_INLINE_ROUND_FLT) // user-specified version // CV_INLINE_ROUND_DBL should 
be defined too - #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ + #elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON) && !defined __SOFTFP__ // 1. general scheme #define ARM_ROUND(_value, _asm_string) \ int res; \ @@ -201,9 +201,7 @@ cvRound( double value ) { #if defined CV_INLINE_ROUND_DBL CV_INLINE_ROUND_DBL(value); -#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \ - && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \ - && !defined(__CUDACC__) +#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__) __m128d t = _mm_set_sd( value ); return _mm_cvtsd_si32(t); #elif defined _MSC_VER && defined _M_IX86 @@ -214,12 +212,11 @@ cvRound( double value ) fistp t; } return t; -#elif defined CV_ICC || defined __GNUC__ - return (int)(lrint(value)); +#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_lrint(value); #else - /* it's ok if round does not comply with IEEE754 standard; - the tests should allow +/-1 difference when the tested functions use round */ - return (int)(value + (value >= 0 ? 0.5 : -0.5)); + return (int)lrint(value); #endif } @@ -233,11 +230,18 @@ cvRound( double value ) */ CV_INLINE int cvFloor( double value ) { -#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \ - && ( \ - defined(__PPC64__) \ - ) - return __builtin_floor(value); +#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_floor(value); +#elif defined __loongarch64 + int i; + double tmp; + __asm__ ("ftintrm.l.d %[tmp], %[in] \n\t" + "movfr2gr.d %[i], %[tmp] \n\t" + : [i] "=r" (i), [tmp] "=f" (tmp) + : [in] "f" (value) + :); + return i; #else int i = (int)value; return i - (i > value); @@ -253,11 +257,18 @@ CV_INLINE int cvFloor( double value ) */ CV_INLINE int cvCeil( double value ) { -#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \ - && ( \ - defined(__PPC64__) \ - ) - return __builtin_ceil(value); +#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_ceil(value); +#elif defined __loongarch64 + int i; + double tmp; + __asm__ ("ftintrp.l.d %[tmp], %[in] \n\t" + "movfr2gr.d %[i], %[tmp] \n\t" + : [i] "=r" (i), [tmp] "=f" (tmp) + : [in] "f" (value) + :); + return i; #else int i = (int)value; return i + (i < value); @@ -292,10 +303,10 @@ CV_INLINE int cvIsInf( double value ) { #if defined CV_INLINE_ISINF_DBL CV_INLINE_ISINF_DBL(value); -#elif defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__PPC64__) +#elif defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__PPC64__) || defined(__loongarch64) Cv64suf ieee754; ieee754.f = value; - return (ieee754.u & 0x7fffffff00000000) == + return (ieee754.u & 0x7fffffffffffffff) == 0x7ff0000000000000; #else Cv64suf ieee754; @@ -312,9 +323,7 @@ CV_INLINE int cvRound(float value) { #if defined CV_INLINE_ROUND_FLT CV_INLINE_ROUND_FLT(value); -#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \ - && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \ - && !defined(__CUDACC__) 
+#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __SSE2__)) && !defined(__CUDACC__) __m128 t = _mm_set_ss( value ); return _mm_cvtss_si32(t); #elif defined _MSC_VER && defined _M_IX86 @@ -325,12 +334,11 @@ CV_INLINE int cvRound(float value) fistp t; } return t; -#elif defined CV_ICC || defined __GNUC__ - return (int)(lrintf(value)); +#elif defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_lrintf(value); #else - /* it's ok if round does not comply with IEEE754 standard; - the tests should allow +/-1 difference when the tested functions use round */ - return (int)(value + (value >= 0 ? 0.5f : -0.5f)); + return (int)lrintf(value); #endif } @@ -343,11 +351,18 @@ CV_INLINE int cvRound( int value ) /** @overload */ CV_INLINE int cvFloor( float value ) { -#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \ - && ( \ - defined(__PPC64__) \ - ) - return __builtin_floorf(value); +#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_floorf(value); +#elif defined __loongarch__ + int i; + float tmp; + __asm__ ("ftintrm.w.s %[tmp], %[in] \n\t" + "movfr2gr.s %[i], %[tmp] \n\t" + : [i] "=r" (i), [tmp] "=f" (tmp) + : [in] "f" (value) + :); + return i; #else int i = (int)value; return i - (i > value); @@ -363,11 +378,18 @@ CV_INLINE int cvFloor( int value ) /** @overload */ CV_INLINE int cvCeil( float value ) { -#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \ - && ( \ - defined(__PPC64__) \ - ) - return __builtin_ceilf(value); +#if defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || \ + defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS + return (int)__builtin_ceilf(value); +#elif defined __loongarch__ + int i; + float tmp; + __asm__ ("ftintrp.w.s %[tmp], %[in] \n\t" + "movfr2gr.s %[i], %[tmp] \n\t" + : [i] "=r" (i), [tmp] "=f" (tmp) + : [in] "f" (value) + :); + return i; #else int i = (int)value; return i + (i < value); diff --git a/3rdParty/opencv2/core/hal/hal.hpp b/3rdParty/opencv2/core/hal/hal.hpp index cdc1286891..34505515c0 100644 --- a/3rdParty/opencv2/core/hal/hal.hpp +++ b/3rdParty/opencv2/core/hal/hal.hpp @@ -91,10 +91,14 @@ CV_EXPORTS void exp64f(const double* src, double* dst, int n); CV_EXPORTS void log32f(const float* src, float* dst, int n); CV_EXPORTS void log64f(const double* src, double* dst, int n); +CV_EXPORTS void cartToPolar32f(const float* x, const float* y, float* mag, float* angle, int n, bool angleInDegrees); +CV_EXPORTS void cartToPolar64f(const double* x, const double* y, double* mag, double* angle, int n, bool angleInDegrees); CV_EXPORTS void fastAtan32f(const float* y, const float* x, float* dst, int n, bool angleInDegrees); CV_EXPORTS void fastAtan64f(const double* y, const double* x, double* dst, int n, bool angleInDegrees); CV_EXPORTS void magnitude32f(const float* x, const float* y, float* dst, int n); CV_EXPORTS void magnitude64f(const double* x, const double* y, double* dst, int n); +CV_EXPORTS void polarToCart32f(const float* mag, const float* angle, float* x, float* y, int n, bool angleInDegrees); +CV_EXPORTS void polarToCart64f(const double* mag, const double* angle, double* x, double* y, int n, bool angleInDegrees); CV_EXPORTS void sqrt32f(const float* src, float* dst, int len); CV_EXPORTS void sqrt64f(const double* src, double* dst, int len); CV_EXPORTS void invSqrt32f(const float* src, 
float* dst, int len); @@ -195,8 +199,8 @@ CV_EXPORTS void addWeighted32s( const int* src1, size_t step1, const int* src2, CV_EXPORTS void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars ); CV_EXPORTS void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars ); -CV_EXPORTS void cvt16f32f( const float16_t* src, float* dst, int len ); -CV_EXPORTS void cvt32f16f( const float* src, float16_t* dst, int len ); +CV_EXPORTS void cvt16f32f( const hfloat* src, float* dst, int len ); +CV_EXPORTS void cvt32f16f( const float* src, hfloat* dst, int len ); CV_EXPORTS void addRNGBias32f( float* arr, const float* scaleBiasPairs, int len ); CV_EXPORTS void addRNGBias64f( double* arr, const double* scaleBiasPairs, int len ); diff --git a/3rdParty/opencv2/core/hal/intrin.hpp b/3rdParty/opencv2/core/hal/intrin.hpp index 52563f44f6..afb74dc953 100644 --- a/3rdParty/opencv2/core/hal/intrin.hpp +++ b/3rdParty/opencv2/core/hal/intrin.hpp @@ -50,6 +50,12 @@ #include #include "opencv2/core/cvdef.h" +#if defined(__GNUC__) && __GNUC__ == 12 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + #define OPENCV_HAL_ADD(a, b) ((a) + (b)) #define OPENCV_HAL_AND(a, b) ((a) & (b)) #define OPENCV_HAL_NOP(a) (a) @@ -58,7 +64,7 @@ namespace { inline unsigned int trailingZeros32(unsigned int value) { #if defined(_MSC_VER) -#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) +#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) unsigned long index = 0; _BitScanForward(&index, value); return (unsigned int)index; @@ -185,6 +191,19 @@ CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double) #endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +template inline _VecTp v_setzero_(); +template inline _VecTp v_setall_(uchar); +template inline _VecTp v_setall_(schar); +template inline _VecTp v_setall_(ushort); +template inline _VecTp v_setall_(short); +template inline _VecTp v_setall_(unsigned); +template inline _VecTp v_setall_(int); +template inline _VecTp v_setall_(uint64); +template inline _VecTp v_setall_(int64); +template inline _VecTp v_setall_(float); +template inline _VecTp v_setall_(double); + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif @@ -200,7 +219,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; # undef CV_RVV #endif -#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_RVV) && !defined(CV_FORCE_SIMD128_CPP) +#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP) #define CV__SIMD_FORWARD 128 #include "opencv2/core/hal/intrin_forward.hpp" #endif @@ -230,7 +249,11 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #include "opencv2/core/hal/intrin_wasm.hpp" #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP) -#include "opencv2/core/hal/intrin_rvv.hpp" +#include "opencv2/core/hal/intrin_rvv_scalable.hpp" + +#elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP) + +#include "opencv2/core/hal/intrin_lsx.hpp" #else @@ -266,6 +289,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE; #endif +#if CV_LASX + +#define CV__SIMD_FORWARD 256 +#include "opencv2/core/hal/intrin_forward.hpp" +#include 
"opencv2/core/hal/intrin_lasx.hpp" + +#endif + //! @cond IGNORED namespace cv { @@ -314,6 +345,14 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_SIMD512_FP16 0 #endif +#ifndef CV_SIMD_SCALABLE +#define CV_SIMD_SCALABLE 0 +#endif + +#ifndef CV_SIMD_SCALABLE_64F +#define CV_SIMD_SCALABLE_64F 0 +#endif + //================================================================================================== template struct V_RegTraits @@ -375,6 +414,18 @@ template struct V_RegTraits CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void); CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16); #endif +#if CV_SIMD_SCALABLE + CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void); + CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void); + CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void); + CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void); + CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void); + CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void); + CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32); + CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void); + CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void); + CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32); +#endif //! @endcond #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512) @@ -488,6 +539,22 @@ namespace CV__SIMD_NAMESPACE { #define VXPREFIX(func) v##func } // namespace using namespace CV__SIMD_NAMESPACE; + +#elif CV_SIMD_SCALABLE +#define CV__SIMD_NAMESPACE simd +namespace CV__SIMD_NAMESPACE { + #define CV_SIMD 0 + #define CV_SIMD_WIDTH 128 /* 1024/8 */ + + #define VXPREFIX(func) v##func +} // namespace +using namespace CV__SIMD_NAMESPACE; + +#endif + +//! @cond IGNORED +#ifndef CV_SIMD_64F +#define CV_SIMD_64F 0 #endif namespace CV__SIMD_NAMESPACE { @@ -505,7 +572,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); } inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); } inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); } #endif //! @} @@ -522,7 +589,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); } inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); } inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); } #endif //! @} @@ -539,7 +606,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); } inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); } inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); } #endif //! 
@} @@ -556,7 +623,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); } #endif //! @} @@ -573,7 +640,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); } inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); } #endif //! @} @@ -590,7 +657,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); } #endif //! @} @@ -607,7 +674,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); } #endif //! @} @@ -624,7 +691,7 @@ namespace CV__SIMD_NAMESPACE { inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } -#if CV_SIMD_64F +#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); } #endif //! @} @@ -650,7 +717,7 @@ namespace CV__SIMD_NAMESPACE { inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); } inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); } inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); } - inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); } + inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); } //! @} //! 
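The vx_load* / vx_setall_* wrappers above are what make kernels width-agnostic: the same loop body compiles to 128-, 256- or 512-bit code (or to the scalable RVV path) depending on the build. A small sketch using the named arithmetic ops and the VTraits<>::vlanes() lane-count query that this patch introduces further down; the function name and the bias parameter are illustrative.

#include "opencv2/core/hal/intrin.hpp"

static void add_bias(const float* src, float* dst, int n, float bias)
{
    int i = 0;
#if CV_SIMD || CV_SIMD_SCALABLE
    const int step = cv::VTraits<cv::v_float32>::vlanes(); // lanes per register at run time
    cv::v_float32 vbias = cv::vx_setall_f32(bias);
    for (; i + step <= n; i += step)
    {
        cv::v_float32 v = cv::vx_load(src + i);     // width picked by the active backend
        cv::v_store(dst + i, cv::v_add(v, vbias));  // named op instead of operator+
    }
#endif
    for (; i < n; i++)  // scalar tail
        dst[i] = src[i] + bias;
}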
@name Wide load with quad expansion @@ -663,6 +730,221 @@ namespace CV__SIMD_NAMESPACE { /** @brief SIMD processing state cleanup call */ inline void vx_cleanup() { VXPREFIX(_cleanup)(); } +#if !CV_SIMD_SCALABLE + // Compatibility layer +#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP)) + template struct VTraits { + static inline int vlanes() { return T::nlanes; } + enum { nlanes = T::nlanes, max_nlanes = T::nlanes }; + using lane_type = typename T::lane_type; + }; + + //////////// get0 //////////// + #define OPENCV_HAL_WRAP_GRT0(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \ + { \ + return v.get0(); \ + } + + OPENCV_HAL_WRAP_GRT0(v_uint8) + OPENCV_HAL_WRAP_GRT0(v_int8) + OPENCV_HAL_WRAP_GRT0(v_uint16) + OPENCV_HAL_WRAP_GRT0(v_int16) + OPENCV_HAL_WRAP_GRT0(v_uint32) + OPENCV_HAL_WRAP_GRT0(v_int32) + OPENCV_HAL_WRAP_GRT0(v_uint64) + OPENCV_HAL_WRAP_GRT0(v_int64) + OPENCV_HAL_WRAP_GRT0(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_GRT0(v_uint8x16) + OPENCV_HAL_WRAP_GRT0(v_uint16x8) + OPENCV_HAL_WRAP_GRT0(v_uint32x4) + OPENCV_HAL_WRAP_GRT0(v_uint64x2) + OPENCV_HAL_WRAP_GRT0(v_int8x16) + OPENCV_HAL_WRAP_GRT0(v_int16x8) + OPENCV_HAL_WRAP_GRT0(v_int32x4) + OPENCV_HAL_WRAP_GRT0(v_int64x2) + OPENCV_HAL_WRAP_GRT0(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_GRT0(v_uint8x32) + OPENCV_HAL_WRAP_GRT0(v_uint16x16) + OPENCV_HAL_WRAP_GRT0(v_uint32x8) + OPENCV_HAL_WRAP_GRT0(v_uint64x4) + OPENCV_HAL_WRAP_GRT0(v_int8x32) + OPENCV_HAL_WRAP_GRT0(v_int16x16) + OPENCV_HAL_WRAP_GRT0(v_int32x8) + OPENCV_HAL_WRAP_GRT0(v_int64x4) + OPENCV_HAL_WRAP_GRT0(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_GRT0(v_float64x4) + #endif + #endif +#endif + + #define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \ + template \ + inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \ + return v_add(v_add(f1, f2), f3, vf...); \ + } + + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + // when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + // when we use CV_SIMD256 with 512 bit SIMD (e.g. 
AVX512) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4) + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4) + #endif + #endif + + #define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \ + template \ + inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \ + return v_mul(v_mul(f1, f2), f3, vf...); \ + } + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8) + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4) + #endif + #endif + + #define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \ + inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \ + { \ + return v_extract_n::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_EXTRACT(v_uint8) + OPENCV_HAL_WRAP_EXTRACT(v_int8) + OPENCV_HAL_WRAP_EXTRACT(v_uint16) + OPENCV_HAL_WRAP_EXTRACT(v_int16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32) + OPENCV_HAL_WRAP_EXTRACT(v_int32) + OPENCV_HAL_WRAP_EXTRACT(v_uint64) + OPENCV_HAL_WRAP_EXTRACT(v_int64) + OPENCV_HAL_WRAP_EXTRACT(v_float32) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64) + #endif + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_EXTRACT(v_uint8x16) + OPENCV_HAL_WRAP_EXTRACT(v_uint16x8) + OPENCV_HAL_WRAP_EXTRACT(v_uint32x4) + OPENCV_HAL_WRAP_EXTRACT(v_uint64x2) + OPENCV_HAL_WRAP_EXTRACT(v_int8x16) + OPENCV_HAL_WRAP_EXTRACT(v_int16x8) + OPENCV_HAL_WRAP_EXTRACT(v_int32x4) + OPENCV_HAL_WRAP_EXTRACT(v_int64x2) + OPENCV_HAL_WRAP_EXTRACT(v_float32x4) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64x2) + #endif + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_EXTRACT(v_uint8x32) + OPENCV_HAL_WRAP_EXTRACT(v_uint16x16) + OPENCV_HAL_WRAP_EXTRACT(v_uint32x8) + OPENCV_HAL_WRAP_EXTRACT(v_uint64x4) + OPENCV_HAL_WRAP_EXTRACT(v_int8x32) + OPENCV_HAL_WRAP_EXTRACT(v_int16x16) + OPENCV_HAL_WRAP_EXTRACT(v_int32x8) + OPENCV_HAL_WRAP_EXTRACT(v_int64x4) + OPENCV_HAL_WRAP_EXTRACT(v_float32x8) + #if CV_SIMD_64F + OPENCV_HAL_WRAP_EXTRACT(v_float64x4) + #endif + #endif + + #define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \ + inline _Tpvec v_broadcast_highest(const _Tpvec& v) \ + { \ + return v_broadcast_element::nlanes-1>(v); \ + } + + OPENCV_HAL_WRAP_BROADCAST(v_uint32) + 
OPENCV_HAL_WRAP_BROADCAST(v_int32) + OPENCV_HAL_WRAP_BROADCAST(v_float32) + #if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128 + OPENCV_HAL_WRAP_BROADCAST(v_uint32x4) + OPENCV_HAL_WRAP_BROADCAST(v_int32x4) + OPENCV_HAL_WRAP_BROADCAST(v_float32x4) + #endif + #if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256 + OPENCV_HAL_WRAP_BROADCAST(v_uint32x8) + OPENCV_HAL_WRAP_BROADCAST(v_int32x8) + OPENCV_HAL_WRAP_BROADCAST(v_float32x8) + #endif + +#endif //!CV_SIMD_SCALABLE //! @cond IGNORED @@ -680,10 +962,6 @@ namespace CV__SIMD_NAMESPACE { #undef VXPREFIX } // namespace -//! @cond IGNORED -#ifndef CV_SIMD_64F -#define CV_SIMD_64F 0 -#endif #ifndef CV_SIMD_FP16 #define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types @@ -703,4 +981,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond +#if defined(__GNUC__) && __GNUC__ == 12 +#pragma GCC diagnostic pop +#endif + #endif diff --git a/3rdParty/opencv2/core/hal/intrin_avx.hpp b/3rdParty/opencv2/core/hal/intrin_avx.hpp index 058b06ca9e..f9a58ccd77 100644 --- a/3rdParty/opencv2/core/hal/intrin_avx.hpp +++ b/3rdParty/opencv2/core/hal/intrin_avx.hpp @@ -447,6 +447,10 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d) { return _Tpvec(_mm256_setzero_si256()); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ @@ -472,6 +476,10 @@ OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64) { return _Tpvec(_mm256_setzero_##zsuffix()); } \ inline _Tpvec v256_setall_##suffix(_Tp v) \ { return _Tpvec(_mm256_set1_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \ OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ @@ -673,53 +681,51 @@ OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4) /** Arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, 
v_int32x8, _mm256_mullo_epi32) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64) - -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps) -OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd) -OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint8x32, _mm256_adds_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint8x32, _mm256_subs_epu8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int8x32, _mm256_adds_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int8x32, _mm256_subs_epi8) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint16x16, _mm256_adds_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint16x16, _mm256_subs_epu16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int16x16, _mm256_adds_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int16x16, _mm256_subs_epi16) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_uint32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int32x8, _mm256_add_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int32x8, _mm256_sub_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_int32x8, _mm256_mullo_epi32) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_uint64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_uint64x4, _mm256_sub_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_int64x4, _mm256_add_epi64) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_int64x4, _mm256_sub_epi64) + +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float32x8, _mm256_add_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float32x8, _mm256_sub_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float32x8, _mm256_mul_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float32x8, _mm256_div_ps) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_add, v_float64x4, _mm256_add_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_sub, v_float64x4, _mm256_sub_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_mul, v_float64x4, _mm256_mul_pd) +OPENCV_HAL_IMPL_AVX_BIN_OP(v_div, v_float64x4, _mm256_div_pd) // saturating multiply 8-bit, 16-bit -inline v_uint8x32 operator * (const v_uint8x32& a, const v_uint8x32& b) +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) { v_uint16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x32 operator * (const v_int8x32& a, const v_int8x32& b) +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) { v_int16x16 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) { __m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epu16(a.val, b.val); @@ -727,7 +733,7 @@ inline v_uint16x16 operator * (const v_uint16x16& a, const v_uint16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_uint16x16(_v256_packs_epu32(p0, p1)); } -inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) { 
__m256i pl = _mm256_mullo_epi16(a.val, b.val); __m256i ph = _mm256_mulhi_epi16(a.val, b.val); @@ -735,14 +741,6 @@ inline v_int16x16 operator * (const v_int16x16& a, const v_int16x16& b) __m256i p1 = _mm256_unpackhi_epi16(pl, ph); return v_int16x16(_mm256_packs_epi32(p0, p1)); } -inline v_uint8x32& operator *= (v_uint8x32& a, const v_uint8x32& b) -{ a = a * b; return a; } -inline v_int8x32& operator *= (v_int8x32& a, const v_int8x32& b) -{ a = a * b; return a; } -inline v_uint16x16& operator *= (v_uint16x16& a, const v_uint16x16& b) -{ a = a * b; return a; } -inline v_int16x16& operator *= (v_int16x16& a, const v_int16x16& b) -{ a = a * b; return a; } /** Non-saturating arithmetics **/ #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \ @@ -833,13 +831,13 @@ inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(srai(a.val, imm)); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -867,11 +865,11 @@ OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx /** Bitwise logic **/ -#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \ - OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ +#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_and, _Tpvec, _mm256_and_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_or, _Tpvec, _mm256_or_##suffix) \ + OPENCV_HAL_IMPL_AVX_BIN_OP(v_xor, _Tpvec, _mm256_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1)) @@ -900,29 +898,29 @@ OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd) /** Comparison **/ -#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } \ - inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ - { return b > a; } \ - inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a < b); } \ - inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ - { return b >= a; } +#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \ - inline _Tpuvec operator == (const 
_Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m256i smask = _mm256_set1_##suffix(sbit); \ return _Tpuvec(_mm256_cmpgt_##suffix( \ _mm256_xor_si256(a.val, smask), \ _mm256_xor_si256(b.val, smask))); \ } \ - inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \ - inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \ OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec) @@ -932,25 +930,25 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768) OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \ - inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \ - inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ - { return ~(a == b); } + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4) OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4) #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); } #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \ - OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix) + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, suffix) \ + OPENCV_HAL_IMPL_AVX_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, suffix) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd) @@ -1216,9 +1214,9 @@ inline unsigned v_reduce_sum(const v_uint32x8& a) { return v_reduce_sum(v_reinterpret_as_s32(a)); } inline int v_reduce_sum(const v_int16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x16& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline float v_reduce_sum(const v_float32x8& a) { @@ -1273,27 +1271,27 @@ inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) { v_uint32x8 l, h; - v_expand(v_add_wrap(a - b, b - a), l, h); - return v_reduce_sum(l + h); + 
v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) { v_uint32x8 l, h; v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) { - return v_reduce_sum(v_max(a, b) - v_min(a, b)); + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 m = a < b; - return v_reduce_sum(v_reinterpret_as_u32(((a - b) ^ m) - m)); + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); } inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) { - return v_reduce_sum((a - b) & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); + return v_reduce_sum(v_and(v_sub(a, b), v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))))); } /** Popcount **/ @@ -1308,15 +1306,15 @@ inline v_uint8x32 v_popcount(const v_uint8x32& a) inline v_uint16x16 v_popcount(const v_uint16x16& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v256_setall_u16(0x00ff)); } inline v_uint32x8 v_popcount(const v_uint32x8& a) { v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v256_setall_u32(0x000000ff)); } inline v_uint64x4 v_popcount(const v_uint64x4& a) { @@ -1408,9 +1406,9 @@ OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16) inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b*b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps) OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd) @@ -1419,7 +1417,7 @@ OPENCV_HAL_IMPL_AVX_MISC(v_float64x4, pd) inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) @@ -1429,16 +1427,16 @@ inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x inline v_float32x8 v_invsqrt(const v_float32x8& x) { - v_float32x8 half = x * v256_setall_f32(0.5); + v_float32x8 half = v_mul(x, v256_setall_f32(0.5)); v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val)); // todo: _mm256_fnmsub_ps - t *= v256_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, v_sub(v256_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; } inline v_float64x4 v_invsqrt(const v_float64x4& x) { - return v256_setall_f64(1.) 
/ v_sqrt(x); + return v_div(v256_setall_f64(1.), v_sqrt(x)); } /** Absolute values **/ @@ -1451,23 +1449,23 @@ OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16) OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32) inline v_float32x8 v_abs(const v_float32x8& x) -{ return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); } +{ return v_and(x, v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)))); } inline v_float64x4 v_abs(const v_float64x4& x) -{ return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); } +{ return v_and(x, v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1)))); } /** Absolute difference **/ inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) { v_int8x32 d = v_sub_wrap(a, b); - v_int8x32 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x32 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) @@ -1475,26 +1473,26 @@ inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) { - v_int32x8 d = a - b; - v_int32x8 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x8 d = v_sub(a, b); + v_int32x8 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x32 v_absdiffs(const v_int8x32& a, const v_int8x32& b) { - v_int8x32 d = a - b; - v_int8x32 m = a < b; - return (d ^ m) - m; + v_int8x32 d = v_sub(a, b); + v_int8x32 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1789,7 +1787,7 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec) inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); } inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) @@ -1799,7 +1797,7 @@ inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) return v_int64x4(_mm256_add_epi64(even, odd)); } inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) @@ -1816,7 +1814,7 @@ inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) return v_uint32x8(_mm256_add_epi32(prod0, prod1)); 
} inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) { @@ -1831,7 +1829,7 @@ inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) return v_int32x8(_mm256_add_epi32(prod0, prod1)); } inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) @@ -1855,7 +1853,7 @@ inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) )); } inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) { @@ -1871,13 +1869,13 @@ inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) )); } inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1923,7 +1921,7 @@ inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& return v_uint64x4(_mm256_add_epi64(p15_, p9d_)); } inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) { @@ -1934,7 +1932,7 @@ inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) return v_int64x4(_mm256_add_epi64(lo, hi)); } inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) @@ -1953,7 +1951,7 @@ inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1); v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2); v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, @@ -2058,43 +2056,43 @@ v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
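// Worked example of the rounding trick used in the v_rshr_pack* family below: adding
// delta = 1 << (n-1) before a right shift by n turns truncating division by 2^n into
// round-half-up division. For n = 4 (i.e. dividing by 16):
//   x = 197: (197 + 8) >> 4 = 205 >> 4 = 12, matching 197/16 = 12.3125 rounded
//   x = 200: (200 + 8) >> 4 = 208 >> 4 = 13, matching 200/16 = 12.5 rounded up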
v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) { v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template inline v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(schar* ptr, const v_int16x16& a) { v_int16x16 delta = v256_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2127,43 +2125,43 @@ v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) { // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers. v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) { v_uint32x8 delta = v256_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template inline v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(short* ptr, const v_int32x8& a) { v_int32x8 delta = v256_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2192,28 +2190,28 @@ template inline v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(unsigned* ptr, const 
v_uint64x4& a) { v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(int* ptr, const v_int64x4& a) { v_int64x4 delta = v256_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean @@ -3137,7 +3135,7 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, u // FP16 // -inline v_float32x8 v256_load_expand(const float16_t* ptr) +inline v_float32x8 v256_load_expand(const hfloat* ptr) { #if CV_FP16 return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); @@ -3149,7 +3147,7 @@ inline v_float32x8 v256_load_expand(const float16_t* ptr) #endif } -inline void v_pack_store(float16_t* ptr, const v_float32x8& a) +inline void v_pack_store(hfloat* ptr, const v_float32x8& a) { #if CV_FP16 __m128i ah = _mm256_cvtps_ph(a.val, 0); @@ -3158,7 +3156,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a) float CV_DECL_ALIGNED(32) buf[8]; v_store_aligned(buf, a); for (int i = 0; i < 8; i++) - ptr[i] = float16_t(buf[i]); + ptr[i] = hfloat(buf[i]); #endif } @@ -3168,6 +3166,20 @@ inline void v_pack_store(float16_t* ptr, const v_float32x8& a) inline void v256_cleanup() { _mm256_zeroall(); } +#include "intrin_math.hpp" +inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f(x); } +inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f(x); } +inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f(x); } +inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f(x); } + +inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f(x); } +inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f(x); } +inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! 
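The wrappers added above route the new transcendental intrinsics for the AVX2-width types to the shared reference implementations in intrin_math.hpp (v_exp_default_32f and friends). A minimal usage sketch follows, assuming an AVX2-enabled build where v_float32x8 is available; the function and buffer names are illustrative only:

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    // Element-wise softmax over one 8-lane register.
    inline void softmax8(const float* src, float* dst)
    {
        v_float32x8 x = v256_load(src);
        v_float32x8 e = v_exp(x);                          // e^x per lane
        v_float32x8 s = v256_setall_f32(v_reduce_sum(e));  // broadcast the lane sum
        v_store(dst, v_div(e, s));
    }

    // v_sincos computes both results in one pass:
    //   v_float32x8 sn, cs;  v_sincos(x, sn, cs);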
@endcond diff --git a/3rdParty/opencv2/core/hal/intrin_avx512.hpp b/3rdParty/opencv2/core/hal/intrin_avx512.hpp index 226187587e..9d6eee2dde 100644 --- a/3rdParty/opencv2/core/hal/intrin_avx512.hpp +++ b/3rdParty/opencv2/core/hal/intrin_avx512.hpp @@ -458,6 +458,10 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) { return _Tpvec(_mm512_setzero_si512()); } \ inline _Tpvec v512_setall_##suffix(_Tp v) \ { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v512_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v512_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ @@ -483,6 +487,10 @@ OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) { return _Tpvec(_mm512_setzero_##zsuffix()); } \ inline _Tpvec v512_setall_##suffix(_Tp v) \ { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v512_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v512_setall_##suffix(v); } \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ @@ -506,12 +514,12 @@ inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) { return v_float64x8(_mm512_castps_pd(a.val)); } // FP16 -inline v_float32x16 v512_load_expand(const float16_t* ptr) +inline v_float32x16 v512_load_expand(const hfloat* ptr) { return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr))); } -inline void v_pack_store(float16_t* ptr, const v_float32x16& a) +inline void v_pack_store(hfloat* ptr, const v_float32x16& a) { __m256i ah = _mm512_cvtps_ph(a.val, 0); _mm256_storeu_si256((__m256i*)ptr, ah); @@ -663,58 +671,56 @@ inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) } #define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ - { return _Tpvec(intrin(a.val, b.val)); } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) - -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int32x16, 
_mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_int64x8, _mm512_mullo_epi64) /** Saturating arithmetics **/ -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) - -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) -OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) -OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_add, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_sub, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_mul, v_float64x8, _mm512_mul_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(v_div, v_float64x8, _mm512_div_pd) // saturating multiply -inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +inline v_uint8x64 v_mul(const v_uint8x64& a, const v_uint8x64& b) { v_uint16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +inline v_int8x64 v_mul(const v_int8x64& a, const v_int8x64& b) { v_int16x32 c, d; v_mul_expand(a, b, c, d); return v_pack(c, d); } -inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +inline v_uint16x32 v_mul(const v_uint16x32& a, const v_uint16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epu16(a.val, b.val); @@ -724,7 +730,7 @@ inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) const __m512i m = _mm512_set1_epi32(65535); return 
v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); } -inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +inline v_int16x32 v_mul(const v_int16x32& a, const v_int16x32& b) { __m512i pl = _mm512_mullo_epi16(a.val, b.val); __m512i ph = _mm512_mulhi_epi16(a.val, b.val); @@ -733,15 +739,6 @@ inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) return v_int16x32(_mm512_packs_epi32(p0, p1)); } -inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) -{ a = a * b; return a; } -inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) -{ a = a * b; return a; } -inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) -{ a = a * b; return a; } -inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) -{ a = a * b; return a; } - inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } @@ -802,13 +799,13 @@ inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ - inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ - inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ - inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ template \ inline _Tpuvec v_shl(const _Tpuvec& a) \ @@ -830,10 +827,10 @@ OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ - OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_and, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_or, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(v_xor, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) @@ -865,16 +862,16 @@ OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) /** Comparison **/ #define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ - 
OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_eq, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ne, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_lt, _MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_gt, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_le, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(v_ge, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) @@ -886,16 +883,16 @@ OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) #define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } #define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ - OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_eq, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ne, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_lt, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_gt, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_le, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(v_ge, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) @@ -1250,9 +1247,9 @@ OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) inline int v_reduce_sum(const v_int16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint v_reduce_sum(const v_uint16x32& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } #define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ inline sctype v_reduce_##func(const _Tpvec& a) \ @@ -1306,17 +1303,17 @@ inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); } inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) -{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +{ return 
v_reduce_sum(v_add_wrap(v_sub(a, b), v_sub(b, a))); } inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) { return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) -{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +{ return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); } inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) -{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +{ return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b)))); } inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) -{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))))); } inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) -{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } +{ return v_reduce_sum(v_and(v_sub(a, b), v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff))))); } /** Popcount **/ inline v_uint8x64 v_popcount(const v_int8x64& a) @@ -1351,8 +1348,8 @@ inline v_uint16x32 v_popcount(const v_int16x32& a) _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v512_setall_u16(0x00ff)); #endif } inline v_uint32x16 v_popcount(const v_int32x16& a) @@ -1361,9 +1358,9 @@ inline v_uint32x16 v_popcount(const v_int32x16& a) return v_uint32x16(_mm512_popcnt_epi32(a.val)); #else v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v512_setall_u32(0x000000ff)); #endif } inline v_uint64x8 v_popcount(const v_int64x8& a) @@ -1403,9 +1400,9 @@ inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinte inline _Tpvec v_sqrt(const _Tpvec& x) \ { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_fma(a, a, b * b); } \ + { return v_fma(a, a, v_mul(b, b)); } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ - { return v_sqrt(v_fma(a, a, b * b)); } + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) @@ -1413,7 +1410,7 @@ OPENCV_HAL_IMPL_AVX512_MISC(v_float32x16, ps) OPENCV_HAL_IMPL_AVX512_MISC(v_float64x8, pd) inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) { return v_fma(a, b, c); } @@ -1422,9 +1419,9 @@ inline v_float32x16 v_invsqrt(const v_float32x16& x) #if CV_AVX_512ER return v_float32x16(_mm512_rsqrt28_ps(x.val)); #else - v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 half = v_mul(x, v512_setall_f32(0.5)); v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); - t *= v512_setall_f32(1.5) - ((t * t) * half); + t = v_mul(t, 
v_sub(v512_setall_f32(1.5), v_mul(v_mul(t, t), half))); return t; #endif } @@ -1434,7 +1431,7 @@ inline v_float64x8 v_invsqrt(const v_float64x8& x) #if CV_AVX_512ER return v_float64x8(_mm512_rsqrt28_pd(x.val)); #else - return v512_setall_f64(1.) / v_sqrt(x); + return v_div(v512_setall_f64(1.), v_sqrt(x)); // v_float64x8 half = x * v512_setall_f64(0.5); // v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); // t *= v512_setall_f64(1.5) - ((t * t) * half); @@ -1482,17 +1479,17 @@ inline v_float64x8 v_abs(const v_float64x8& x) /** Absolute difference **/ inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) { v_int8x64 d = v_sub_wrap(a, b); - v_int8x64 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x64 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) @@ -1500,26 +1497,26 @@ inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) { - v_int32x16 d = a - b; - v_int32x16 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x16 d = v_sub(a, b); + v_int32x16 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Saturating absolute difference **/ inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) { - v_int8x64 d = a - b; - v_int8x64 m = a < b; - return (d ^ m) - m; + v_int8x64 d = v_sub(a, b); + v_int8x64 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } ////////// Conversions ///////// @@ -1818,7 +1815,7 @@ inline v_float32x16 v_pack_triplets(const v_float32x16& vec) inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) { return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) @@ -1828,7 +1825,7 @@ inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) @@ -1844,7 +1841,7 @@ inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) return v_uint32x16(_mm512_add_epi32(prod0, prod1)); } inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const 
v_uint32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) { @@ -1859,7 +1856,7 @@ inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) return v_int32x16(_mm512_add_epi32(prod0, prod1)); } inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) @@ -1883,7 +1880,7 @@ inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) )); } inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) { @@ -1893,13 +1890,13 @@ inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) return v_int64x8(_mm512_add_epi64(even, odd)); } inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1944,7 +1941,7 @@ inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); } inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) { return v_dotprod_expand(a, b); } @@ -1955,7 +1952,7 @@ inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) { return v_dotprod_expand(a, b); } inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ @@ -1969,7 +1966,7 @@ inline v_float32x16 v_matmul(const v_float32x16& v, v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); - return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); } inline v_float32x16 v_matmuladd(const v_float32x16& v, @@ -2070,43 +2067,43 @@ v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b) { // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers. 
v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - return v_pack_u(v_reinterpret_as_s16((a + delta) >> n), - v_reinterpret_as_s16((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s16(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s16(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a) { v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1))); - v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s16(v_shr(v_add(a, delta), n))); } template inline v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(schar* ptr, const v_int16x32& a) { v_int16x32 delta = v512_setall_s16((short)(1 << (n-1))); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 32 @@ -2139,43 +2136,43 @@ template inline v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - return v_pack_u(v_reinterpret_as_s32((a + delta) >> n), - v_reinterpret_as_s32((b + delta) >> n)); + return v_pack_u(v_reinterpret_as_s32(v_shr(v_add(a, delta), n)), + v_reinterpret_as_s32(v_shr(v_add(b, delta), n))); } template inline void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a) { v_uint32x16 delta = v512_setall_u32(1 << (n-1)); - v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n)); + v_pack_u_store(ptr, v_reinterpret_as_s32(v_shr(v_add(a, delta), n))); } template inline v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack_u((a + delta) >> n, (b + delta) >> n); + return v_pack_u(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_u_store(ptr, (a + delta) >> n); + v_pack_u_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(short* ptr, const v_int32x16& a) { v_int32x16 delta = v512_setall_s32(1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // 64 @@ -2196,28 +2193,28 @@ template inline v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b) { v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a) { v_uint64x8 delta = v512_setall_u64((uint64)1 << 
(n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } template inline v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - return v_pack((a + delta) >> n, (b + delta) >> n); + return v_pack(v_shr(v_add(a, delta), n), v_shr(v_add(b, delta), n)); } template inline void v_rshr_pack_store(int* ptr, const v_int64x8& a) { v_int64x8 delta = v512_setall_s64((int64)1 << (n-1)); - v_pack_store(ptr, (a + delta) >> n); + v_pack_store(ptr, v_shr(v_add(a, delta), n)); } // pack boolean @@ -3081,6 +3078,20 @@ inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signm inline void v512_cleanup() { _mm256_zeroall(); } +#include "intrin_math.hpp" +inline v_float32x16 v_exp(const v_float32x16& x) { return v_exp_default_32f(x); } +inline v_float32x16 v_log(const v_float32x16& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x16& x, v_float32x16& s, v_float32x16& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x16 v_sin(const v_float32x16& x) { return v_sin_default_32f(x); } +inline v_float32x16 v_cos(const v_float32x16& x) { return v_cos_default_32f(x); } +inline v_float32x16 v_erf(const v_float32x16& x) { return v_erf_default_32f(x); } + +inline v_float64x8 v_exp(const v_float64x8& x) { return v_exp_default_64f(x); } +inline v_float64x8 v_log(const v_float64x8& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x8& x, v_float64x8& s, v_float64x8& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x8 v_sin(const v_float64x8& x) { return v_sin_default_64f(x); } +inline v_float64x8 v_cos(const v_float64x8& x) { return v_cos_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/intrin_cpp.hpp b/3rdParty/opencv2/core/hal/intrin_cpp.hpp index 3e0cd66fd6..1b2462c642 100644 --- a/3rdParty/opencv2/core/hal/intrin_cpp.hpp +++ b/3rdParty/opencv2/core/hal/intrin_cpp.hpp @@ -48,6 +48,7 @@ #include #include #include +#include "opencv2/core/utility.hpp" #include "opencv2/core/saturate.hpp" //! @cond IGNORED @@ -224,32 +225,32 @@ These operations allow to reorder or recombine elements in one or multiple vecto Element-wise binary and unary operations. 
- Arithmetics: -@ref operator +(const v_reg &a, const v_reg &b) "+", -@ref operator -(const v_reg &a, const v_reg &b) "-", -@ref operator *(const v_reg &a, const v_reg &b) "*", -@ref operator /(const v_reg &a, const v_reg &b) "/", +@ref v_add(const v_reg &a, const v_reg &b) "+", +@ref v_sub(const v_reg &a, const v_reg &b) "-", +@ref v_mul(const v_reg &a, const v_reg &b) "*", +@ref v_div(const v_reg &a, const v_reg &b) "/", @ref v_mul_expand - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap - Bitwise shifts: -@ref operator <<(const v_reg &a, int s) "<<", -@ref operator >>(const v_reg &a, int s) ">>", +@ref v_shl(const v_reg &a, int s) "<<", +@ref v_shr(const v_reg &a, int s) ">>", @ref v_shl, @ref v_shr - Bitwise logic: -@ref operator &(const v_reg &a, const v_reg &b) "&", -@ref operator |(const v_reg &a, const v_reg &b) "|", -@ref operator ^(const v_reg &a, const v_reg &b) "^", -@ref operator ~(const v_reg &a) "~" +@ref v_and(const v_reg &a, const v_reg &b) "&", +@ref v_or(const v_reg &a, const v_reg &b) "|", +@ref v_xor(const v_reg &a, const v_reg &b) "^", +@ref v_not(const v_reg &a) "~" - Comparison: -@ref operator >(const v_reg &a, const v_reg &b) ">", -@ref operator >=(const v_reg &a, const v_reg &b) ">=", -@ref operator <(const v_reg &a, const v_reg &b) "<", -@ref operator <=(const v_reg &a, const v_reg &b) "<=", -@ref operator ==(const v_reg &a, const v_reg &b) "==", -@ref operator !=(const v_reg &a, const v_reg &b) "!=" +@ref v_gt(const v_reg &a, const v_reg &b) ">", +@ref v_ge(const v_reg &a, const v_reg &b) ">=", +@ref v_lt(const v_reg &a, const v_reg &b) "<", +@ref v_le(const v_reg &a, const v_reg &b) "<=", +@ref v_eq(const v_reg &a, const v_reg &b) "==", +@ref v_ne(const v_reg &a, const v_reg &b) "!=" - min/max: @ref v_min, @ref v_max @@ -262,7 +263,8 @@ Most of these operations return only one value. ### Other math -- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude +- Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude, @ref v_exp, @ref v_log, + @ref v_erf, @ref v_sin, @ref v_cos - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs ### Conversions @@ -362,6 +364,9 @@ Floating point: |reverse | x | x | |extract_n | x | x | |broadcast_element | x | | +|exp | x | x | +|log | x | x | +|sin, cos | x | x | @{ */ @@ -569,50 +574,43 @@ enum { /** @brief Add values For all types. */ -template CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_add(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Subtract values For all types. */ -template CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_sub(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Multiply values For 16- and 32-bit integer types and floating types. */ -template CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_mul(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Divide values For floating types only. 
*/ -template CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_div(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise AND Only for integer types. */ -template CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_and(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise OR Only for integer types. */ -template CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_or(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise XOR Only for integer types.*/ -template CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); -template CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); +template CV_INLINE v_reg<_Tp, n> v_xor(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b); /** @brief Bitwise NOT Only for integer types.*/ -template CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a); +template CV_INLINE v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a); #ifndef CV_DOXYGEN @@ -635,33 +633,26 @@ __CV_EXPAND(macro_name(double, __VA_ARGS__)) \ CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \ CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \ -#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \ +#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op, func) \ template inline \ -v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ return c; \ -} \ -template inline \ -v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ -{ \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op) +#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op, func) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op, func) -CV__HAL_INTRIN_IMPL_BIN_OP(+) -CV__HAL_INTRIN_IMPL_BIN_OP(-) -CV__HAL_INTRIN_IMPL_BIN_OP(*) -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /) +CV__HAL_INTRIN_IMPL_BIN_OP(+, v_add) +CV__HAL_INTRIN_IMPL_BIN_OP(-, v_sub) +CV__HAL_INTRIN_IMPL_BIN_OP(*, v_mul) +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /, v_div) -#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \ +#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op, func) \ template CV_INLINE \ -v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ v_reg<_Tp, n> c; \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ @@ -669,29 +660,20 @@ v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ return c; \ -} \ -template CV_INLINE \ -v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, 
n>& a, const v_reg<_Tp, n>& b) \ -{ \ - typedef typename V_TypeTraits<_Tp>::int_type itype; \ - for( int i = 0; i < n; i++ ) \ - a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \ - V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \ - return a; \ } -#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \ -CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */ +#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) \ +CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op, func) /* TODO: FIXIT remove this after masks refactoring */ -CV__HAL_INTRIN_IMPL_BIT_OP(&) -CV__HAL_INTRIN_IMPL_BIT_OP(|) -CV__HAL_INTRIN_IMPL_BIT_OP(^) +CV__HAL_INTRIN_IMPL_BIT_OP(&, v_and) +CV__HAL_INTRIN_IMPL_BIT_OP(|, v_or) +CV__HAL_INTRIN_IMPL_BIT_OP(^, v_xor) -#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \ +#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy, dummy2) \ template CV_INLINE \ -v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ +v_reg<_Tp, n> v_not(const v_reg<_Tp, n>& a) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -699,7 +681,7 @@ v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \ return c; \ } \ -CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~) +CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~, v_not) #endif // !CV_DOXYGEN @@ -720,12 +702,85 @@ template inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) Only for floating point types.*/ OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp) -//! @cond IGNORED -OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) -OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) +/** + * @brief Exponential \f$ e^x \f$ of elements + * + * Only for floating point types. Core implementation steps: + * 1. Decompose Input: Convert the input to \f$ 2^{x \cdot \log_2e} \f$ and split its exponential into integer and fractional parts: + * \f$ x \cdot \log_2e = n + f \f$, where \f$ n \f$ is the integer part and \f$ f \f$ is the fractional part. + * 2. Compute \f$ 2^n \f$: Calculated by shifting the bits. + * 3. Adjust Fractional Part: Compute \f$ f \cdot \ln2 \f$ to convert the fractional part to base \f$ e \f$. + * \f$ C1 \f$ and \f$ C2 \f$ are used to adjust the fractional part. + * 4. Polynomial Approximation for \f$ e^{f \cdot \ln2} \f$: The closer the fractional part is to 0, the more accurate the result. + * - For float16 and float32, use a Taylor Series with 6 terms. + * - For float64, use Pade Polynomials Approximation with 4 terms. + * 5. Combine Results: Multiply the two parts together to get the final result: + * \f$ e^x = 2^n \cdot e^{f \cdot \ln2} \f$. + * + * @note The precision of the calculation depends on the implementation and the data type of the input vector. + */ OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp) +#define OPENCV_HAL_MATH_HAVE_EXP 1 + +/** + * @brief Natural logarithm \f$ \log(x) \f$ of elements + * + * Only for floating point types. Core implementation steps: + * 1. Decompose Input: Use binary representation to decompose the input into mantissa part \f$ m \f$ and exponent part \f$ e \f$. Such that \f$ \log(x) = \log(m \cdot 2^e) = \log(m) + e \cdot \ln(2) \f$. + * 2. 
Adjust Mantissa and Exponent Parts: If the mantissa is less than \f$ \sqrt{0.5} \f$, adjust the exponent and mantissa to ensure the mantissa is in the range \f$ (\sqrt{0.5}, \sqrt{2}) \f$ for better approximation. + * 3. Polynomial Approximation for \f$ \log(m) \f$: The closer the \f$ m \f$ is to 1, the more accurate the result. + * - For float16 and float32, use a Taylor Series with 9 terms. + * - For float64, use Pade Polynomials Approximation with 6 terms. + * 4. Combine Results: Add the two parts together to get the final result. + * + * @note The precision of the calculation depends on the implementation and the data type of the input. + * + * @note Similar to the behavior of std::log(), \f$ \ln(0) = -\infty \f$. + */ OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp) -//! @endcond + +/** + * @brief Error function. + * + * @note Support FP32 precision for now. + */ +OPENCV_HAL_IMPL_MATH_FUNC(v_erf, std::erf, _Tp) + +/** + * @brief Compute sine \f$ sin(x) \f$ and cosine \f$ cos(x) \f$ of elements at the same time + * + * Only for floating point types. Core implementation steps: + * 1. Input Normalization: Scale the periodicity from 2π to 4 and reduce the angle to the range \f$ [0, \frac{\pi}{4}] \f$ using periodicity and trigonometric identities. + * 2. Polynomial Approximation for \f$ sin(x) \f$ and \f$ cos(x) \f$: + * - For float16 and float32, use a Taylor series with 4 terms for sine and 5 terms for cosine. + * - For float64, use a Taylor series with 7 terms for sine and 8 terms for cosine. + * 3. Select Results: select and convert the final sine and cosine values for the original input angle. + * + * @note The precision of the calculation depends on the implementation and the data type of the input vector. + */ +template +inline void v_sincos(const v_reg<_Tp, n>& x, v_reg<_Tp, n>& s, v_reg<_Tp, n>& c) +{ + for( int i = 0; i < n; i++ ) + { + s.s[i] = std::sin(x.s[i]); + c.s[i] = std::cos(x.s[i]); + } +} + +/** + * @brief Sine \f$ sin(x) \f$ of elements + * + * Only for floating point types. Core implementation the same as @ref v_sincos. + */ +OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp) + +/** + * @brief Cosine \f$ cos(x) \f$ of elements + * + * Only for floating point types. Core implementation the same as @ref v_sincos. + */ +OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp) /** @brief Absolute value of elements @@ -848,9 +903,9 @@ inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \ +#define OPENCV_HAL_IMPL_CMP_OP(cmp_op, func) \ template \ -inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ +inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \ { \ typedef typename V_TypeTraits<_Tp>::int_type itype; \ v_reg<_Tp, n> c; \ @@ -862,32 +917,28 @@ inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> /** @brief Less-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(<) +OPENCV_HAL_IMPL_CMP_OP(<, v_lt) /** @brief Greater-than comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(>) +OPENCV_HAL_IMPL_CMP_OP(>, v_gt) /** @brief Less-than or equal comparison For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(<=) +OPENCV_HAL_IMPL_CMP_OP(<=, v_le) /** @brief Greater-than or equal comparison For all types except 64-bit integer values. 
*/ -OPENCV_HAL_IMPL_CMP_OP(>=) +OPENCV_HAL_IMPL_CMP_OP(>=, v_ge) -/** @brief Equal comparison +/** @brief Equal comparison */ +OPENCV_HAL_IMPL_CMP_OP(==, v_eq) -For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(==) - -/** @brief Not equal comparison - -For all types except 64-bit integer values. */ -OPENCV_HAL_IMPL_CMP_OP(!=) +/** @brief Not equal comparison */ +OPENCV_HAL_IMPL_CMP_OP(!=, v_ne) template inline v_reg v_not_nan(const v_reg& a) @@ -1256,8 +1307,8 @@ template inline void v_hsum(const v_reg<_Tp, n>& a, //! @brief Helper macro //! @ingroup core_hal_intrin_impl -#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \ -template inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \ +#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op, func) \ +template inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, int imm) \ { \ v_reg<_Tp, n> c; \ for( int i = 0; i < n; i++ ) \ @@ -1268,12 +1319,12 @@ template inline v_reg<_Tp, n> operator shift_op(const v_reg /** @brief Bitwise shift left For 16-, 32- and 64-bit integer values. */ -OPENCV_HAL_IMPL_SHIFT_OP(<< ) +OPENCV_HAL_IMPL_SHIFT_OP(<<, v_shl) /** @brief Bitwise shift right For 16-, 32- and 64-bit integer values. */ -OPENCV_HAL_IMPL_SHIFT_OP(>> ) +OPENCV_HAL_IMPL_SHIFT_OP(>>, v_shr) //! @brief Helper macro //! @ingroup core_hal_intrin_impl @@ -2782,7 +2833,8 @@ inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, //! @brief Helper macro //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \ -inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } +inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); } \ +template <> inline _Tpvec v_setzero_() { return _Tpvec::zero(); } //! @name Init with zero //! @{ @@ -2828,7 +2880,8 @@ OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64) //! @brief Helper macro //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \ -inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } +inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \ +template <> inline _Tpvec v_setall_(_Tp val) { return _Tpvec::all(val); } //! @name Init with value //! @{ @@ -2897,7 +2950,7 @@ OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64) //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \ template inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \ -{ return a << shift; } +{ return v_shl(a, shift); } //! @name Left shift //! @{ @@ -2914,7 +2967,7 @@ OPENCV_HAL_IMPL_C_SHIFTL(int64) //! @ingroup core_hal_intrin_impl #define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \ template inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \ -{ return a >> shift; } +{ return v_shr(a, shift); } //! @name Right shift //! 
@{ @@ -3240,7 +3293,7 @@ inline v_reg v_matmuladd(const v_reg& v, template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } template inline v_reg v_dotprod_expand(const v_reg& a, const v_reg& b, const v_reg& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } @@ -3254,7 +3307,7 @@ template inline v_reg v_dotprod_expand_fast(const v_reg -v_load_expand(const float16_t* ptr) +v_load_expand(const hfloat* ptr) { v_reg v; for( int i = 0; i < v.nlanes; i++ ) @@ -3265,7 +3318,7 @@ v_load_expand(const float16_t* ptr) } #if CV_SIMD256 inline v_reg -v256_load_expand(const float16_t* ptr) +v256_load_expand(const hfloat* ptr) { v_reg v; for (int i = 0; i < v.nlanes; i++) @@ -3277,7 +3330,7 @@ v256_load_expand(const float16_t* ptr) #endif #if CV_SIMD512 inline v_reg -v512_load_expand(const float16_t* ptr) +v512_load_expand(const hfloat* ptr) { v_reg v; for (int i = 0; i < v.nlanes; i++) @@ -3289,11 +3342,11 @@ v512_load_expand(const float16_t* ptr) #endif template inline void -v_pack_store(float16_t* ptr, const v_reg& v) +v_pack_store(hfloat* ptr, const v_reg& v) { for( int i = 0; i < v.nlanes; i++ ) { - ptr[i] = float16_t(v.s[i]); + ptr[i] = hfloat(v.s[i]); } } diff --git a/3rdParty/opencv2/core/hal/intrin_forward.hpp b/3rdParty/opencv2/core/hal/intrin_forward.hpp index 7c5066f4da..524574c6d8 100644 --- a/3rdParty/opencv2/core/hal/intrin_forward.hpp +++ b/3rdParty/opencv2/core/hal/intrin_forward.hpp @@ -188,4 +188,4 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond -} // cv:: \ No newline at end of file +} // cv:: diff --git a/3rdParty/opencv2/core/hal/intrin_lasx.hpp b/3rdParty/opencv2/core/hal/intrin_lasx.hpp new file mode 100644 index 0000000000..3661b7ef32 --- /dev/null +++ b/3rdParty/opencv2/core/hal/intrin_lasx.hpp @@ -0,0 +1,3036 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_LASX_HPP +#define OPENCV_HAL_INTRIN_LASX_HPP + +#include +#include + +#define CV_SIMD256 1 +#define CV_SIMD256_64F 1 +#define CV_SIMD256_FP16 0 + +namespace cv +{ + +//! 
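The new intrin_lasx.hpp introduced here is a 256-bit universal-intrinsics backend for LoongArch ASX (LASX), advertised through CV_SIMD256/CV_SIMD256_64F, so code written against the universal API needs no source changes to target it. A minimal sketch of that idea, assuming a build with 256-bit SIMD enabled; the function and argument names are illustrative:

    #include "opencv2/core/hal/intrin.hpp"
    using namespace cv;

    #if CV_SIMD256
    // dst[i] = a[i] + w * b[i] for one 8-lane float register;
    // the same source compiles to AVX2 or LASX depending on the target.
    inline void axpy8(float w, const float* a, const float* b, float* dst)
    {
        v_float32x8 va = v256_load(a);
        v_float32x8 vb = v256_load(b);
        v_store(dst, v_fma(vb, v256_setall_f32(w), va));
    }
    #endif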
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Utils //////////// + +inline __m256i _v256_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31 }; +} + +inline __m256i _v256_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, char v7, char v8, char v9, + char v10, char v11, char v12, char v13, char v14, char v15, char v16, char v17, char v18, char v19, + char v20, char v21, char v22, char v23, char v24, char v25, char v26, char v27, char v28, char v29, + char v30, char v31) +{ + return (__m256i)v32i8{ v31, v30, + v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, + v19, v18, v17, v16, v15, v14, v13, v12, v11, v10, + v9, v8, v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15) +{ + return (__m256i)v16i16{ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 }; +} + +inline __m256i _v256_setr_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256i _v256_set_w(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) +{ + return (__m256i)v8i32{ v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m256i _v256_setall_w(int v0) +{ + return (__m256i)v8i32{ v0, v0, v0, v0, v0, v0, v0, v0 }; +} + +inline __m256i _v256_setr_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v0, v1, v2, v3 }; +} + +inline __m256i _v256_set_d(int64 v0, int64 v1, int64 v2, int64 v3) +{ + return (__m256i)v4i64{ v3, v2, v1, v0 }; +} + +inline __m256 _v256_setr_ps(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7) +{ + return (__m256)v8f32{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m256 _v256_setall_ps(float f32) +{ + return (__m256)v8f32{ f32, f32, f32, f32, f32, f32, f32, f32 }; +} + +inline __m256d _v256_setr_pd(double v0, double v1, double v2, double v3) +{ + return (__m256d)v4f64{ v0, v1, v2, v3 }; +} + +inline __m256d _v256_setall_pd(double f64) +{ + return (__m256d)v4f64{ f64, f64, f64, f64 }; +} + +inline __m256i _lasx_packus_h(const __m256i& a, const __m256i& b) +{ + return __lasx_xvssrarni_bu_h(b, a, 0); +} + +inline __m256i _lasx_packs_h(const __m256i& a, const __m256i& b) +{ + return __lasx_xvssrarni_b_h(b, a, 0); +} + +inline __m256i _lasx_packus_w(const __m256i& a, const __m256i& b) +{ + return __lasx_xvssrarni_hu_w(b, a, 0); +} + +inline __m256i _lasx_packs_w(const __m256i& a, const __m256i& b) +{ + return __lasx_xvssrarni_h_w(b, a, 0); +} + +inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi) +{ return __lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02); } + +inline __m256 _v256_combine(const __m128& lo, const __m128& hi) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&lo), *((__m256i*)&hi), 0x02)); } + +inline 
__m256i _v256_shuffle_odd_64(const __m256i& v) +{ return __lasx_xvpermi_d(v, 0xd8); } + +inline __m256d _v256_shuffle_odd_64(const __m256d& v) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&v), 0xd8)); } + +//LASX: only use for permute WITHOUT zero clearing +template +inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b) +{ return __lasx_xvpermi_q(a, b, imm); } + +template +inline __m256 _v256_permute2x128(const __m256& a, const __m256& b) +{ return __m256(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b) +{ return __m256d(__lasx_xvpermi_q(*((__m256i*)&a), *((__m256i*)&b), imm)); } + +template +inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_permute2x128(a.val, b.val)); } + +template +inline __m256i _v256_permute4x64(const __m256i& a) +{ return __lasx_xvpermi_d(a, imm); } + +template +inline __m256d _v256_permute4x64(const __m256d& a) +{ return __m256d(__lasx_xvpermi_d(*((__m256i*)&a), imm)); } + +template +inline _Tpvec v256_permute4x64(const _Tpvec& a) +{ return _Tpvec(_v256_permute4x64(a.val)); } + +inline __m128i _v256_extract_high(const __m256i& v) +{ __m256i temp256i = __lasx_xvpermi_d(v, 0x4E); + return *((__m128i*)&temp256i); } + +inline __m128 _v256_extract_high(const __m256& v) +{ return __m128(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128d _v256_extract_high(const __m256d& v) +{ return __m128d(_v256_extract_high(*((__m256i*)&v))); } + +inline __m128i _v256_extract_low(const __m256i& v) +{ return *((__m128i*)&v); } + +inline __m128 _v256_extract_low(const __m256& v) +{ return __m128(_v256_extract_low(*((__m256i*)&v))); } + +inline __m128d _v256_extract_low(const __m256d& v) +{ return __m128d(_v256_extract_low(*((__m256i*)&v))); } + +inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b) +{ + return __lasx_xvssrlrni_hu_w(b, a, 0); +} + +template +inline int _v256_extract_b(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_b(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_h(const __m256i& a) +{ + int des[1] = {0}; + __lasx_xvstelm_h(a, des, 0, i); + return des[0]; +} + +template +inline int _v256_extract_w(const __m256i& a) +{ + return __lasx_xvpickve2gr_w(a, i); +} + +template +inline int64 _v256_extract_d(const __m256i& a) +{ + return __lasx_xvpickve2gr_d(a, i); +} + +///////// Types //////////// + +struct v_uint8x32 +{ + typedef uchar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_uint8x32(__m256i v) : val(v) {} + v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31) + { + val = _v256_setr_b((char)v0, (char)v1, (char)v2, (char)v3, + (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9, + (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15, + (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21, + (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27, + (char)v28, (char)v29, (char)v30, (char)v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint8x32() {} + + uchar get0() const { + uchar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int8x32 +{ + typedef 
schar lane_type; + enum { nlanes = 32 }; + __m256i val; + + explicit v_int8x32(__m256i v) : val(v) {} + v_int8x32(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31) + { + val = _v256_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, + v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int8x32() {} + + schar get0() const { + schar des[1] = {0}; + __lasx_xvstelm_b(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint16x16 +{ + typedef ushort lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_uint16x16(__m256i v) : val(v) {} + v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15) + { + val = _v256_setr_h((short)v0, (short)v1, (short)v2, (short)v3, + (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9, + (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint16x16() {} + + ushort get0() const { + ushort des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_int16x16 +{ + typedef short lane_type; + enum { nlanes = 16 }; + __m256i val; + + explicit v_int16x16(__m256i v) : val(v) {} + v_int16x16(short v0, short v1, short v2, short v3, + short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, + short v12, short v13, short v14, short v15) + { + val = _v256_setr_h(v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int16x16() {} + + short get0() const { + short des[1] = {0}; + __lasx_xvstelm_h(val, des, 0, 0); + return des[0]; + } +}; + +struct v_uint32x8 +{ + typedef unsigned lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_uint32x8(__m256i v) : val(v) {} + v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7) + { + val = _v256_setr_w((unsigned)v0, (unsigned)v1, (unsigned)v2, + (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_uint32x8() {} + + unsigned get0() const { return __lasx_xvpickve2gr_wu(val, 0); } +}; + +struct v_int32x8 +{ + typedef int lane_type; + enum { nlanes = 8 }; + __m256i val; + + explicit v_int32x8(__m256i v) : val(v) {} + v_int32x8(int v0, int v1, int v2, int v3, + int v4, int v5, int v6, int v7) + { + val = _v256_setr_w(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_int32x8() {} + + int get0() const { return __lasx_xvpickve2gr_w(val, 0); } +}; + +struct v_float32x8 +{ + typedef float lane_type; + enum { nlanes = 8 }; + __m256 val; + + explicit v_float32x8(__m256 v) : val(v) {} + explicit v_float32x8(__m256i v) { val = *((__m256*)&v); } + v_float32x8(float v0, float v1, float v2, float v3, + float v4, float v5, float v6, float v7) + { + val = _v256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7); + } + /* coverity[uninit_ctor]: suppress warning */ + v_float32x8() {} + + float get0() const { + 
float des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } + + int get0toint() const { + int des[1] = {0}; + __lasx_xvstelm_w(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +struct v_uint64x4 +{ + typedef uint64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_uint64x4(__m256i v) : val(v) {} + v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3) + { val = _v256_setr_d((int64)v0, (int64)v1, (int64)v2, (int64)v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_uint64x4() {} + + uint64 get0() const + { + return __lasx_xvpickve2gr_du(val, 0); + } +}; + +struct v_int64x4 +{ + typedef int64 lane_type; + enum { nlanes = 4 }; + __m256i val; + + explicit v_int64x4(__m256i v) : val(v) {} + v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3) + { val = _v256_setr_d(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_int64x4() {} + + int64 get0() const + { + return __lasx_xvpickve2gr_d(val, 0); + } +}; + +struct v_float64x4 +{ + typedef double lane_type; + enum { nlanes = 4 }; + __m256d val; + + explicit v_float64x4(__m256d v) : val(v) {} + explicit v_float64x4(__m256i v) { val = *((__m256d*)&v); } + v_float64x4(double v0, double v1, double v2, double v3) + { val = _v256_setr_pd(v0, v1, v2, v3); } + /* coverity[uninit_ctor]: suppress warning */ + v_float64x4() {} + + double get0() const { + double des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + } + + int64 get0toint64() const { + int64 des[1] = {0}; + __lasx_xvstelm_d(*((__m256i*)&val), des, 0, 0); + return des[0]; + } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vlo = __lsx_vld(ptr0, 0); \ + __m128i vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint8x32, uchar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int8x32, schar) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint16x16, ushort) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int16x16, short) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint32x8, unsigned) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int32x8, int) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_uint64x4, uint64) +OPENCV_HAL_IMPL_LASX_LOADSTORE(v_int64x4, int64) + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \ + inline _Tpvec v256_load(const _Tp* ptr) \ + { 
return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lasx_xvld(ptr, 0)); } \ + inline _Tpvec v256_load_low(const _Tp* ptr) \ + { \ + __m128i v128 = __lsx_vld(ptr, 0); \ + return _Tpvec(*((__m256i*)&v128)); \ + } \ + inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = __lsx_vld(ptr0, 0); \ + halfreg vhi = __lsx_vld(ptr1, 0); \ + return _Tpvec(_v256_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lasx_xvst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + __lasx_xvst(a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + __lasx_xvst(a.val, ptr, 0); \ + else \ + __lasx_xvst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_low(a.val), ptr, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(_v256_extract_high(a.val), ptr, 0); } + +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float32x8, float, __m128i) +OPENCV_HAL_IMPL_LASX_LOADSTORE_FLT(v_float64x4, double, __m128i) + + +inline __m256i _lasx_256_castps_si256(const __m256& v) +{ return __m256i(v); } + +inline __m256i _lasx_256_castpd_si256(const __m256d& v) +{ return __m256i(v); } + +#define OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_LASX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(__lasx_xvreplgr2vr_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_(_Tp v) \ + { return v256_setall_##suffix(v); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float32x8, suffix, _lasx_256_castps_si256) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_float64x4, suffix, _lasx_256_castpd_si256) + +OPENCV_HAL_IMPL_LASX_INIT(v_uint8x32, uchar, u8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int8x32, schar, s8, b, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint16x16, ushort, u16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int16x16, short, s16, h, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint32x8, unsigned, u32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_int32x8, int, s32, w, int) +OPENCV_HAL_IMPL_LASX_INIT(v_uint64x4, uint64, u64, d, long int) +OPENCV_HAL_IMPL_LASX_INIT(v_int64x4, int64, s64, d, long int) + + +inline __m256 _lasx_256_castsi256_ps(const __m256i &v) +{ return __m256(v); } + +inline __m256d _lasx_256_castsi256_pd(const __m256i &v) +{ 
return __m256d(v); } + +#define OPENCV_HAL_IMPL_LASX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v256_setzero_##suffix() \ + { return _Tpvec(__lasx_xvreplgr2vr_d(0)); } \ + inline _Tpvec v256_setall_##suffix(_Tp v) \ + { return _Tpvec(_v256_setall_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_<_Tpvec>() \ + { return v256_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_<_Tpvec>(_Tp v) \ + { return v256_setall_##suffix(v); } \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int8x32, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int16x16, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int32x8, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_uint64x4, suffix, cast) \ + OPENCV_HAL_IMPL_LASX_CAST(_Tpvec, v_int64x4, suffix, cast) + +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float32x8, float, f32, ps, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_INIT_FLT(v_float64x4, double, f64, pd, _lasx_256_castsi256_pd) + +inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a) +{ return a; } +inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a) +{ return v_float32x8(_lasx_256_castps_si256(__m256(a.val))); } + +inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a) +{ return a; } +inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a) +{ return v_float64x4(_lasx_256_castpd_si256(__m256d(a.val))); } + + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_LASX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvl_##suffix(__m256i(b.val), __m256i(a.val))); } \ + inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvilvh_##suffix(__m256i(b.val), __m256i(a.val))); } + +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int8x32, b) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int16x16, h) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_int64x4, d) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float32x8, w) +OPENCV_HAL_IMPL_LASX_UNPACK(v_float64x4, d) + + +// shuffle +// todo: emulate 64bit +#define OPENCV_HAL_IMPL_LASX_SHUFFLE(_Tpvec, intrin) \ + template<int m> \ + inline _Tpvec v256_shuffle(const _Tpvec& a) \ + { return _Tpvec(__lasx_xvshuf4i_##intrin(a.val, m)); } + +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_uint32x8, w) +OPENCV_HAL_IMPL_LASX_SHUFFLE(v_int32x8, w) + +template<int m> +inline v_float32x8 v256_shuffle(const v_float32x8 &a) +{ return v_float32x8(__lasx_xvshuf4i_w(*((__m256i*)&a.val), m)); } + +template<int m> +inline v_float64x4 v256_shuffle(const v_float64x4 &a) +{ + const int m1 = m & 0b1; + const int m2 = m & 0b10; + const int m3 = m & 0b100; + const int m4 = m & 0b1000; + const int m5 = m2 << 1; + const int m6 = m3 << 2; + const int m7 = m4 << 3; + const int m8 = m1 & m5 & m6 & m7; + + return v_float64x4(__lasx_xvshuf4i_d(*((__m256i*)&a.val), *((__m256i*)&a.val), m8)); +} + +template<typename _Tpvec> +inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) +{ + ab0 = v256_unpacklo(a, b); + ab1 = v256_unpackhi(a, b); +} + +template<typename _Tpvec> +inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline
v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) +{ return v_float32x8(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvpermi_q(a.val, b.val, 0x12)); } + +template +inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) +{ return v256_permute2x128<0x03>(a, b); } + +inline __m256i _v256_alignr_b(const __m256i &a, const __m256i &b, const int imm) +{ + if (imm == 8) { + return __lasx_xvshuf4i_d(b, a, 0x9); // b.d1 a.d0 b.d3 a.d2 + } else { + __m256i byteIndex = _v256_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return __lasx_xvshuf_b(a, b, __lasx_xvadd_b(__lasx_xvreplgr2vr_b(imm), byteIndex)); + } +} + +template +inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b) +{ return _Tpvec(_v256_alignr_b(a.val, b.val, 8)); } +inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b) +{ return v_float64x4(__lasx_xvshuf4i_d(b.val, a.val, 0x9)); } // b.d1 a.d0 b.d3 a.d2 +// todo: emulate float32 + +template +inline _Tpvec v256_swap_halves(const _Tpvec& a) +{ return v256_permute2x128<1>(a, a); } + +template +inline _Tpvec v256_reverse_64(const _Tpvec& a) +{ return v256_permute4x64<0x1b>(a); } + + +// ZIP +#define OPENCV_HAL_IMPL_LASX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x02>(a, b); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return v256_permute2x128<0x13>(a, b); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + _Tpvec a1b0 = v256_alignr_128(a, b); \ + c = v256_combine_diagonal(a, a1b0); \ + d = v256_combine_diagonal(a1b0, b); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + _Tpvec ab0ab2, ab1ab3; \ + v256_zip(a, b, ab0ab2, ab1ab3); \ + v_recombine(ab0ab2, ab1ab3, ab0, ab1); \ + } + +OPENCV_HAL_IMPL_LASX_ZIP(v_uint8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_int8x32) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_int16x16) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_int32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_int64x4) +OPENCV_HAL_IMPL_LASX_ZIP(v_float32x8) +OPENCV_HAL_IMPL_LASX_ZIP(v_float64x4) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_LASX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint8x32, __lasx_xvsadd_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint8x32, __lasx_xvssub_bu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int8x32, __lasx_xvsadd_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int8x32, __lasx_xvssub_b) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint16x16, __lasx_xvsadd_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint16x16, __lasx_xvssub_hu) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int16x16, __lasx_xvsadd_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int16x16, __lasx_xvssub_h) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint32x8, __lasx_xvsub_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_uint32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int32x8, __lasx_xvadd_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int32x8, __lasx_xvsub_w) 
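// ---------------------------------------------------------------------------
// Editor's note: illustrative usage sketch only, not part of the upstream
// OpenCV header. It shows how the wrappers generated above are meant to be
// used together: v256_load/v_store move 32 uchar lanes per iteration and
// v_add on v_uint8x32 maps to the saturating __lasx_xvsadd_bu instruction
// (see OPENCV_HAL_IMPL_LASX_BIN_OP above). The function name
// example_add_saturate_u8 is hypothetical.
inline void example_add_saturate_u8(const uchar* a, const uchar* b, uchar* dst, int n)
{
    int i = 0;
    for (; i <= n - v_uint8x32::nlanes; i += v_uint8x32::nlanes)
    {
        v_uint8x32 va = v256_load(a + i);      // 32 lanes per load
        v_uint8x32 vb = v256_load(b + i);
        v_store(dst + i, v_add(va, vb));       // per-lane saturating add
    }
    for (; i < n; ++i)                         // scalar tail
    {
        int s = a[i] + b[i];
        dst[i] = (uchar)(s > 255 ? 255 : s);
    }
}
// ---------------------------------------------------------------------------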
+OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_int32x8, __lasx_xvmul_w) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_uint64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_uint64x4, __lasx_xvsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_int64x4, __lasx_xvadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_int64x4, __lasx_xvsub_d) + +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float32x8, __lasx_xvfadd_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float32x8, __lasx_xvfsub_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float32x8, __lasx_xvfmul_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float32x8, __lasx_xvfdiv_s) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_add, v_float64x4, __lasx_xvfadd_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_sub, v_float64x4, __lasx_xvfsub_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_mul, v_float64x4, __lasx_xvfmul_d) +OPENCV_HAL_IMPL_LASX_BIN_OP(v_div, v_float64x4, __lasx_xvfdiv_d) + +// saturating multiply 8-bit, 16-bit +inline v_uint8x32 v_mul(const v_uint8x32& a, const v_uint8x32& b) +{ + v_uint16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x32 v_mul(const v_int8x32& a, const v_int8x32& b) +{ + v_int16x16 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x16 v_mul(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_hu(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_uint16x16(_v256_packs_epu32(p0, p1)); +} +inline v_int16x16 v_mul(const v_int16x16& a, const v_int16x16& b) +{ + __m256i pl = __lasx_xvmul_h(a.val, b.val); + __m256i ph = __lasx_xvmuh_h(a.val, b.val); + __m256i p0 = __lasx_xvilvl_h(ph, pl); + __m256i p1 = __lasx_xvilvh_h(ph, pl); + return v_int16x16(_lasx_packs_w(p0, p1)); +} + +/** Non-saturating arithmetics **/ + +#define OPENCV_HAL_IMPL_LASX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int8x32, __lasx_xvadd_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_uint16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_add_wrap, v_int16x16, __lasx_xvadd_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int8x32, __lasx_xvsub_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_uint16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_sub_wrap, v_int16x16, __lasx_xvsub_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_uint16x16, __lasx_xvmul_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_mul_wrap, v_int16x16, __lasx_xvmul_h) + +inline v_uint8x32 v_mul_wrap(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i p0 = __lasx_xvmulwev_h_bu(a.val, b.val); + __m256i p1 = __lasx_xvmulwod_h_bu(a.val, b.val); + return v_uint8x32(__lasx_xvpackev_b(p1, p0)); +} + +inline v_int8x32 v_mul_wrap(const v_int8x32& a, const v_int8x32& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x32& a, const v_uint8x32& b, + v_uint16x16& c, v_uint16x16& d) +{ + v_uint16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x32& a, const v_int8x32& b, + v_int16x16& c, v_int16x16& d) +{ + v_int16x16 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, 
b1); +} + +inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b, + v_int32x8& c, v_int32x8& d) +{ + v_int16x16 vhi = v_int16x16(__lasx_xvmuh_h(a.val, b.val)); + + v_int16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b, + v_uint32x8& c, v_uint32x8& d) +{ + v_uint16x16 vhi = v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); + + v_uint16x16 v0, v1; + v_zip(v_mul_wrap(a, b), vhi, v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b, + v_uint64x4& c, v_uint64x4& d) +{ + __m256i v0 = __lasx_xvmulwev_d_wu(a.val, b.val); + __m256i v1 = __lasx_xvmulwod_d_wu(a.val, b.val); + v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d); +} + +inline v_int16x16 v_mul_hi(const v_int16x16& a, const v_int16x16& b) { return v_int16x16(__lasx_xvmuh_h(a.val, b.val)); } +inline v_uint16x16 v_mul_hi(const v_uint16x16& a, const v_uint16x16& b) { return v_uint16x16(__lasx_xvmuh_hu(a.val, b.val)); } + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_LASX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(__lasx_xvsll_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(__lasx_xvsrl_##suffix(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(srai(a.val, __lasx_xvreplgr2vr_##suffix(imm))); } + +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint16x16, v_int16x16, h, __lasx_xvsra_h) +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint32x8, v_int32x8, w, __lasx_xvsra_w) +OPENCV_HAL_IMPL_LASX_SHIFT_OP(v_uint64x4, v_int64x4, d, __lasx_xvsra_d) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_LASX_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix) \ + OPENCV_HAL_IMPL_LASX_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ + { return _Tpvec(__lasx_xvnori_b(a.val, 0)); } + +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int8x32, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int16x16, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int32x8, v, __lasx_xvreplgr2vr_w(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_uint64x4, v, __lasx_xvreplgr2vr_d(-1)) +OPENCV_HAL_IMPL_LASX_LOGIC_OP(v_int64x4, v, __lasx_xvreplgr2vr_d(-1)) + +#define 
OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(*((__m256i*)(&a.val)), *((__m256i*)(&b.val)))); } + +#define OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(_Tpvec, suffix, not_const, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_and, _Tpvec, __lasx_xvand_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_or, _Tpvec, __lasx_xvor_##suffix, cast) \ + OPENCV_HAL_IMPL_LASX_FLOAT_BIN_OP(v_xor, _Tpvec, __lasx_xvxor_##suffix, cast) \ + inline _Tpvec v_not(const _Tpvec& a) \ + { return _Tpvec(__lasx_xvxor_##suffix(*((__m256i*)(&a.val)), not_const)); } + +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float32x8, v, __lasx_xvreplgr2vr_w(-1), _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_FLOAT_LOGIC_OP(v_float64x4, v, __lasx_xvreplgr2vr_d(-1), _lasx_256_castsi256_pd) + +/** Select **/ +#define OPENCV_HAL_IMPL_LASX_SELECT(_Tpvec) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvbitsel_v(b.val, a.val, mask.val)); } + +OPENCV_HAL_IMPL_LASX_SELECT(v_uint8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_int8x32) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_int16x16) +OPENCV_HAL_IMPL_LASX_SELECT(v_uint32x8) +OPENCV_HAL_IMPL_LASX_SELECT(v_int32x8) + +inline v_float32x8 v_select(const v_float32x8 &mask, const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +inline v_float64x4 v_select(const v_float64x4 &mask, const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvbitsel_v(*((__m256i*)&b.val), *((__m256i*)&a.val), *((__m256i*)&mask.val))); } + +/** Comparison **/ +#define OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ + { \ + return _Tpuvec(__lasx_xvslt_##usuffix(b.val, a.val)); \ + } \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lasx_xvslt_##suffix(b.val, a.val)); } \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_LASX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint8x32, v_int8x32, b, bu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint16x16, v_int16x16, h, hu) +OPENCV_HAL_IMPL_LASX_CMP_OP_INT(v_uint32x8, v_int32x8, w, wu) + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(_Tpvec, suffix) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lasx_xvseq_##suffix(a.val, b.val)); } \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } + +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_uint64x4, d) +OPENCV_HAL_IMPL_LASX_CMP_OP_64BIT(v_int64x4, d) + +#define OPENCV_HAL_IMPL_LASX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return 
_Tpvec(__lasx_##suffix##_##ssuffix(a.val, b.val)); } + +#define OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(_Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_eq, xvfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_ne, xvfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_lt, xvfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LASX_CMP_FLT(v_le, xvfcmp_cle, _Tpvec, ssuffix) + +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_CMP_OP_FLT(v_float64x4, d) + +inline v_float32x8 v_gt(const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_clt_s(b.val, a.val)); } + +inline v_float32x8 v_ge(const v_float32x8 &a, const v_float32x8 &b) +{ return v_float32x8(__lasx_xvfcmp_cle_s(b.val, a.val)); } + +inline v_float64x4 v_gt(const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_clt_d(b.val, a.val)); } + +inline v_float64x4 v_ge(const v_float64x4 &a, const v_float64x4 &b) +{ return v_float64x4(__lasx_xvfcmp_cle_d(b.val, a.val)); } + +inline v_float32x8 v_not_nan(const v_float32x8& a) +{ return v_float32x8(__lasx_xvfcmp_cor_s(a.val, a.val)); } +inline v_float64x4 v_not_nan(const v_float64x4& a) +{ return v_float64x4(__lasx_xvfcmp_cor_d(a.val, a.val)); } + +/** min/max **/ +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint8x32, __lasx_xvmin_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint8x32, __lasx_xvmax_bu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int8x32, __lasx_xvmin_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int8x32, __lasx_xvmax_b) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint16x16, __lasx_xvmin_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint16x16, __lasx_xvmax_hu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int16x16, __lasx_xvmin_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int16x16, __lasx_xvmax_h) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_uint32x8, __lasx_xvmin_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_uint32x8, __lasx_xvmax_wu) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_int32x8, __lasx_xvmin_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_int32x8, __lasx_xvmax_w) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float32x8, __lasx_xvfmin_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float32x8, __lasx_xvfmax_s) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_min, v_float64x4, __lasx_xvfmin_d) +OPENCV_HAL_IMPL_LASX_BIN_FUNC(v_max, v_float64x4, __lasx_xvfmax_d) + +/** Rotate **/ +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_R = (16 - imm) & 0xFF}; + enum {IMM_R2 = (32 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x21>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swap, IMM_R)); + return v_uint8x32(_v256_alignr_b(swap, b.val, IMM_R2)); // imm < 32 +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b) +{ + enum {IMM_L = (imm - 16) & 0xFF}; + + if (imm == 0) return a; + if (imm == 32) return b; + if (imm > 32) return v_uint8x32(); + + __m256i swap = _v256_permute2x128<0x03>(a.val, b.val); + if (imm == 16) return v_uint8x32(swap); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swap, a.val, imm)); + return v_uint8x32(_v256_alignr_b(b.val, swap, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_left(const v_uint8x32& a) +{ + enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)}; + enum {IMM_R = (16 - imm) & 0xFF}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? 
[127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(a.val, vzero, 0x20);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(a.val, swapz, IMM_R)); + return v_uint8x32(__lasx_xvbsll_v(swapz, IMM_L)); +} + +template +inline v_uint8x32 v_rotate_right(const v_uint8x32& a) +{ + enum {IMM_L = ((imm - 16) & 0xFF) > 31 ? 31 : ((imm - 16) & 0xFF)}; + + if (imm == 0) return a; + if (imm > 32) return v_uint8x32(); + + // ESAC control[3] ? [127:0] = 0 + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i swapz = __lasx_xvpermi_q(vzero, a.val, 0x21);; + if (imm == 16) return v_uint8x32(swapz); + if (imm < 16) return v_uint8x32(_v256_alignr_b(swapz, a.val, imm)); + return v_uint8x32(__lasx_xvbsrl_v(swapz, IMM_L)); +} + +#define OPENCV_HAL_IMPL_LASX_ROTATE_CAST(intrin, _Tpvec, cast) \ + template \ + inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a), \ + v_reinterpret_as_u8(b)); \ + return _Tpvec(cast(ret.val)); \ + } \ + template \ + inline _Tpvec intrin(const _Tpvec& a) \ + { \ + enum {IMMxW = imm * sizeof(typename _Tpvec::lane_type)}; \ + v_uint8x32 ret = intrin(v_reinterpret_as_u8(a)); \ + return _Tpvec(cast(ret.val)); \ + } + +#define OPENCV_HAL_IMPL_LASX_ROTATE(_Tpvec) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP) + +OPENCV_HAL_IMPL_LASX_ROTATE(v_int8x32) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int16x16) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int32x8) +OPENCV_HAL_IMPL_LASX_ROTATE(v_uint64x4) +OPENCV_HAL_IMPL_LASX_ROTATE(v_int64x4) + +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float32x8, _lasx_256_castsi256_ps) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_left, v_float64x4, _lasx_256_castsi256_pd) +OPENCV_HAL_IMPL_LASX_ROTATE_CAST(v_rotate_right, v_float64x4, _lasx_256_castsi256_pd) + +/** Reverse **/ +inline v_uint8x32 v_reverse(const v_uint8x32 &a) +{ + static const __m256i perm = _v256_setr_b( + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m256i vec = __lasx_xvshuf_b(a.val, a.val, perm); + return v_uint8x32(__lasx_xvpermi_q(vec, vec, 1)); +} + +inline v_int8x32 v_reverse(const v_int8x32 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x16 v_reverse(const v_uint16x16 &a) +{ + __m256i vec = __lasx_xvshuf4i_h(a.val, 0x1B); + vec = __lasx_xvshuf4i_w(vec, 0x4E); + return v_uint16x16(__lasx_xvpermi_d(vec, 0x4E)); +} + +inline v_int16x16 v_reverse(const v_int16x16 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x8 v_reverse(const v_uint32x8 &a) +{ + __m256i vec = __lasx_xvshuf4i_w(a.val, 0x1B); + return v_uint32x8(__lasx_xvpermi_d(vec, 0x4E)); +} + +inline v_int32x8 v_reverse(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x8 v_reverse(const v_float32x8 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x4 v_reverse(const v_uint64x4 &a) +{ + return v_uint64x4(__lasx_xvpermi_d(a.val, 0x1b)); +} + +inline v_int64x4 v_reverse(const v_int64x4 &a) +{ return 
v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x4 v_reverse(const v_float64x4 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////// Reduce and mask ///////// + +/** Reduce **/ +// this function is return a[0]+a[1]+...+a[31] +inline unsigned v_reduce_sum(const v_uint8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_hu_bu(a.val, a.val); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); +} + +inline int v_reduce_sum(const v_int8x32& a) +{ + __m256i t1 = __lasx_xvhaddw_h_b(a.val, a.val); + __m256i t2 = __lasx_xvhaddw_w_h(t1, t1); + __m256i t3 = __lasx_xvhaddw_d_w(t2, t2); + __m256i t4 = __lasx_xvhaddw_q_d(t3, t3); + return (int)(((v8i32)t4)[0]+((v8i32)t4)[4]); +} + +#define OPENCV_HAL_IMPL_LASX_REDUCE_32(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \ + val = intrin(val, __lsx_vbsrl_v(val,8)); \ + val = intrin(val, __lsx_vbsrl_v(val,4)); \ + val = intrin(val, __lsx_vbsrl_v(val,2)); \ + val = intrin(val, __lsx_vbsrl_v(val,1)); \ + return (sctype)__lsx_vpickve2gr_w(val, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, min, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, min, __lsx_vmin_b) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_uint8x32, uchar, max, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LASX_REDUCE_32(v_int8x32, schar, max, __lsx_vmax_b) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 2)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, min, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, min, __lsx_vmin_h) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_uint16x16, ushort, max, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LASX_REDUCE_16(v_int16x16, short, max, __lsx_vmax_h) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i v0 = _v256_extract_low(a.val); \ + __m128i v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 8)); \ + v0 = intrin(v0, __lsx_vbsrl_v(v0, 4)); \ + return (sctype) __lsx_vpickve2gr_w(v0, 0); \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, min, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, min, __lsx_vmin_w) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_uint32x8, unsigned, max, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LASX_REDUCE_8(v_int32x8, int, max, __lsx_vmax_w) + +#define OPENCV_HAL_IMPL_LASX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x8& a) \ + { \ + __m128 v0 = _v256_extract_low(a.val); \ + __m128 v1 = _v256_extract_high(a.val); \ + v0 = intrin(v0, v1); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x0e))); \ + v0 = intrin(v0, __m128(__lsx_vpermi_w(*((__m128i*)&v0), *((__m128i*)&v0), 0x01))); \ + float *fvalue = (float*)&v0; \ + return fvalue[0]; \ + } + +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(min, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LASX_REDUCE_FLT(max, __lsx_vfmax_s) + +inline int 
v_reduce_sum(const v_int32x8& a) +{ + __m256i t1 = __lasx_xvhaddw_d_w(a.val, a.val); + __m256i t2 = __lasx_xvhaddw_q_d(t1, t1); + return (int)(((v8i32)t2)[0]+((v8i32)t2)[4]); +} + +inline unsigned v_reduce_sum(const v_uint32x8& a) +{ return v_reduce_sum(v_reinterpret_as_s32(a)); } + +inline int v_reduce_sum(const v_int16x16& a) +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } +inline unsigned v_reduce_sum(const v_uint16x16& a) +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } + +inline float v_reduce_sum(const v_float32x8& a) +{ + float result = 0; + float *pa = (float*)&a; + for (int i = 0; i < 2; ++i) { + result += pa[i*4] + pa[i*4+1] + pa[i*4+2] + pa[i*4+3]; + } + return result; +} + +inline uint64 v_reduce_sum(const v_uint64x4& a) +{ + __m256i t0 = __lasx_xvhaddw_qu_du(a.val, a.val); + return (uint64)(((v4u64)t0)[0] + ((v4u64)t0)[2]); +} +inline int64 v_reduce_sum(const v_int64x4& a) +{ + __m256i t0 = __lasx_xvhaddw_q_d(a.val, a.val); + return (int64)(((v4i64)t0)[0] + ((v4i64)t0)[2]); +} +inline double v_reduce_sum(const v_float64x4& a) +{ + double *pa = (double*)&a; + return pa[0] + pa[1] + pa[2] + pa[3]; +} + +inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, + const v_float32x8& c, const v_float32x8& d) +{ + float *pa = (float*)&a; + float *pb = (float*)&b; + float *pc = (float*)&c; + float *pd = (float*)&d; + + float v0 = pa[0] + pa[1] + pa[2] + pa[3]; + float v1 = pb[0] + pb[1] + pb[2] + pb[3]; + float v2 = pc[0] + pc[1] + pc[2] + pc[3]; + float v3 = pd[0] + pd[1] + pd[2] + pd[3]; + float v4 = pa[4] + pa[5] + pa[6] + pa[7]; + float v5 = pb[4] + pb[5] + pb[6] + pb[7]; + float v6 = pc[4] + pc[5] + pc[6] + pc[7]; + float v7 = pd[4] + pd[5] + pd[6] + pd[7]; + return v_float32x8(v0, v1, v2, v3, v4, v5, v6, v7); +} + +inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i t0 = __lasx_xvabsd_bu(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); +} +inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) +{ + __m256i t0 = __lasx_xvabsd_b(a.val, b.val); + __m256i t1 = __lasx_xvhaddw_hu_bu(t0, t0); + __m256i t2 = __lasx_xvhaddw_wu_hu(t1, t1); + __m256i t3 = __lasx_xvhaddw_du_wu(t2, t2); + __m256i t4 = __lasx_xvhaddw_qu_du(t3, t3); + return (unsigned)(((v8u32)t4)[0]+((v8u32)t4)[4]); +} +inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_add_wrap(v_sub(a, b), v_sub(b, a)), l, h); + return v_reduce_sum(v_add(l, h)); +} +inline unsigned v_reduce_sad(const v_int16x16& a, const v_int16x16& b) +{ + v_uint32x8 l, h; + v_expand(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))), l, h); + return v_reduce_sum(v_add(l, h)); +} +inline unsigned v_reduce_sad(const v_uint32x8& a, const v_uint32x8& b) +{ + return v_reduce_sum(v_sub(v_max(a, b), v_min(a, b))); +} +inline unsigned v_reduce_sad(const v_int32x8& a, const v_int32x8& b) +{ + v_int32x8 m = v_lt(a, b); + return v_reduce_sum(v_reinterpret_as_u32(v_sub(v_xor(v_sub(a, b), m), m))); +} +inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b) +{ + v_float32x8 a_b = v_sub(a, b); + return v_reduce_sum(v_float32x8(*((__m256i*)&a_b.val) & __lasx_xvreplgr2vr_w(0x7fffffff))); +} + +/** Popcount **/ +inline v_uint8x32 v_popcount(const v_uint8x32& a) +{ return 
v_uint8x32(__lasx_xvpcnt_b(a.val)); } +inline v_uint16x16 v_popcount(const v_uint16x16& a) +{ return v_uint16x16(__lasx_xvpcnt_h(a.val)); } +inline v_uint32x8 v_popcount(const v_uint32x8& a) +{ return v_uint32x8(__lasx_xvpcnt_w(a.val)); } +inline v_uint64x4 v_popcount(const v_uint64x4& a) +{ return v_uint64x4(__lasx_xvpcnt_d(a.val)); } +inline v_uint8x32 v_popcount(const v_int8x32& a) +{ return v_popcount(v_reinterpret_as_u8(a)); } +inline v_uint16x16 v_popcount(const v_int16x16& a) +{ return v_popcount(v_reinterpret_as_u16(a)); } +inline v_uint32x8 v_popcount(const v_int32x8& a) +{ return v_popcount(v_reinterpret_as_u32(a)); } +inline v_uint64x4 v_popcount(const v_int64x4& a) +{ return v_popcount(v_reinterpret_as_u64(a)); } + +inline int v_signmask(const v_int8x32& a) +{ + __m256i result = __lasx_xvmskltz_b(a.val); + int mask = __lasx_xvpickve2gr_w(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 16); + return mask; +} +inline int v_signmask(const v_uint8x32& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } + +inline int v_signmask(const v_int16x16& a) +{ return v_signmask(v_pack(a, a)) & 0xFFFF; } +inline int v_signmask(const v_uint16x16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_int32x8& a) +{ + __m256i result = __lasx_xvmskltz_w(a.val); + int mask = __lasx_xvpickve2gr_w(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 4); + return mask; +} +inline int v_signmask(const v_uint32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_int64x4& a) +{ + __m256i result = __lasx_xvmskltz_d(a.val); + int mask = __lasx_xvpickve2gr_d(result, 0); + mask |= (__lasx_xvpickve2gr_w(result, 4) << 2); + return mask; +} +inline int v_signmask(const v_uint64x4& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } + +inline int v_signmask(const v_float32x8& a) +{ return v_signmask(*(v_int32x8*)(&a)); } + +inline int v_signmask(const v_float64x4& a) +{ return v_signmask(*(v_int64x4*)(&a)); } + +inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +/** Checks **/ +#define OPENCV_HAL_IMPL_LASX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK(v_uint8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_int8x32, -1) +OPENCV_HAL_IMPL_LASX_CHECK(v_uint32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_int32x8, 255) 
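// ---------------------------------------------------------------------------
// Editor's note: illustrative usage sketch only, not part of the upstream
// OpenCV header. It shows the intended pattern for the mask helpers defined
// above: a lane-wise comparison yields an all-ones/all-zeros mask,
// v_check_any() tests whether any lane matched (via v_signmask), and
// v_scan_forward() returns the index of the first matching lane. The function
// name example_find_first_above is hypothetical.
inline int example_find_first_above(const schar* data, int n, schar thresh)
{
    v_int8x32 vthresh = v256_setall_s8(thresh);
    int i = 0;
    for (; i <= n - v_int8x32::nlanes; i += v_int8x32::nlanes)
    {
        v_int8x32 gt = v_gt(v256_load(data + i), vthresh);  // per-lane data > thresh
        if (v_check_any(gt))                                 // any lane matched?
            return i + v_scan_forward(gt);                   // first matching lane
    }
    for (; i < n; ++i)                                       // scalar tail
        if (data[i] > thresh)
            return i;
    return -1;
}
// ---------------------------------------------------------------------------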
+OPENCV_HAL_IMPL_LASX_CHECK(v_uint64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_int64x4, 15) +OPENCV_HAL_IMPL_LASX_CHECK(v_float32x8, 255) +OPENCV_HAL_IMPL_LASX_CHECK(v_float64x4, 15) + +#define OPENCV_HAL_IMPL_LASX_CHECK_SHORT(_Tpvec) \ + inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \ + inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; } +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_CHECK_SHORT(v_int16x16) + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_LASX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lasx_xvfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(__lasx_xvfsqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, v_mul(b, b)); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } + +OPENCV_HAL_IMPL_LASX_MULADD(v_float32x8, s) +OPENCV_HAL_IMPL_LASX_MULADD(v_float64x4, d) + +inline v_int32x8 v_fma(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return v_int32x8(__lasx_xvmadd_w(c.val, a.val, b.val)); +} + +inline v_int32x8 v_muladd(const v_int32x8& a, const v_int32x8& b, const v_int32x8& c) +{ + return v_fma(a, b, c); +} + +inline v_float32x8 v_invsqrt(const v_float32x8& x) +{ return v_float32x8(__lasx_xvfrsqrt_s(x.val)); } + +inline v_float64x4 v_invsqrt(const v_float64x4& x) +{ return v_float64x4(__lasx_xvfrsqrt_d(x.val)); } + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_LASX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(__lasx_xvabsd_##suffix(x.val, __lasx_xvreplgr2vr_w(0))); } + +OPENCV_HAL_IMPL_LASX_ABS(int8x32, b) +OPENCV_HAL_IMPL_LASX_ABS(int16x16, h) +OPENCV_HAL_IMPL_LASX_ABS(int32x8, w) + +inline v_float32x8 v_abs(const v_float32x8& x) +{ return v_float32x8(*((__m256i*)&x) & __lasx_xvreplgr2vr_w(0x7fffffff)); } +inline v_float64x4 v_abs(const v_float64x4& x) +{ return v_float64x4(*((__m256i*)&x) & __lasx_xvreplgr2vr_d(0x7fffffffffffffff)); } + +/** Absolute difference **/ +inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b) +{ return (v_uint8x32)__lasx_xvabsd_bu(a.val, b.val); } +inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b) +{ return (v_uint16x16)__lasx_xvabsd_hu(a.val, b.val); } +inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b) +{ return (v_uint32x8)__lasx_xvabsd_wu(a.val, b.val); } + +inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b) +{ return (v_uint8x32)__lasx_xvabsd_b(a.val, b.val); } +inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b) +{ return (v_uint16x16)__lasx_xvabsd_h(a.val, b.val); } +inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b) +{ return (v_uint32x8)__lasx_xvabsd_w(a.val, b.val); } + +inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b) +{ return v_abs(v_sub(a, b)); } + +inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b) +{ return v_abs(v_sub(a, b)); } + +/** Saturating absolute difference **/ +inline v_int8x32 v_absdiffs(const v_int8x32& a, 
const v_int8x32& b) +{ + v_int8x32 d = v_sub(a, b); + v_int8x32 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); +} +inline v_int16x16 v_absdiffs(const v_int16x16& a, const v_int16x16& b) +{ return v_sub(v_max(a, b), v_min(a, b)); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x8 v_round(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftint_w_s(a.val)); } + +inline v_int32x8 v_round(const v_float64x4& a) +{ __m256i t = __lasx_xvftint_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_round(const v_float64x4& a, const v_float64x4& b) +{ + __m256i abi = __lasx_xvftint_w_d(b.val, a.val); + return v_int32x8(__lasx_xvpermi_d(abi, 0b11011000)); //3120 +} + +inline v_int32x8 v_trunc(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(a.val)); } + +inline v_int32x8 v_trunc(const v_float64x4& a) +{ __m256i t = __lasx_xvftintrz_w_d(a.val, a.val); + return v_int32x8(__lasx_xvpermi_d(t, 0x88)); } + +inline v_int32x8 v_floor(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrm_s(a.val)))); } + +inline v_int32x8 v_floor(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrm_d(a.val))); } + +inline v_int32x8 v_ceil(const v_float32x8& a) +{ return v_int32x8(__lasx_xvftintrz_w_s(__m256(__lasx_xvfrintrp_s(a.val)))); } + +inline v_int32x8 v_ceil(const v_float64x4& a) +{ return v_trunc(v_float64x4(__lasx_xvfrintrp_d(a.val))); } + +/** To float **/ +inline v_float32x8 v_cvt_f32(const v_int32x8& a) +{ return v_float32x8(__lasx_xvffint_s_w(a.val)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a) +{ return v_float32x8(__lasx_xvpermi_d(__lasx_xvfcvt_s_d(a.val, a.val), 0x88)); } + +inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b) +{ + __m256 abf = __lasx_xvfcvt_s_d(a.val, b.val); //warnning: order of a,b is diff from instruction xvfcvt.s.d + return v_float32x8(__lasx_xvpermi_d(abf, 0x8D)); +} + +inline v_float64x4 v_cvt_f64(const v_int32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvffintl_d_w(alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_int32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvffintl_d_w(ahigh)); +} + +inline v_float64x4 v_cvt_f64(const v_float32x8& a) +{ + __m256i alow = __lasx_xvpermi_d(a.val, 0x10); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)alow)); +} + +inline v_float64x4 v_cvt_f64_high(const v_float32x8& a) +{ + __m256i ahigh = __lasx_xvpermi_d(a.val, 0x32); + return v_float64x4(__lasx_xvfcvtl_d_s((__m256)ahigh)); +} + +inline v_float64x4 v_cvt_f64(const v_int64x4& v) +{ return v_float64x4(__lasx_xvffint_d_l(v.val)); } + +////////////// Lookup table access //////////////////// + +inline v_int8x32 v256_lut(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_b(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], + tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], + tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]], tab[idx[16]], tab[idx[17]], + tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]], + tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], + tab[idx[30]], tab[idx[31]])); +} +inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_h(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), + *(const short*)(tab + 
idx[ 3]), *(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), + *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]), *(const short*)(tab + idx[ 8]), + *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]), + *(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), + *(const short*)(tab + idx[15]))); +} +inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x32(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]))); +} +inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); } + +inline v_int16x16 v256_lut(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_h(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], + tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], tab[idx[ 8]], tab[idx[ 9]], + tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], + tab[idx[15]])); +} +inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int16x16 v256_lut_quads(const short* tab, const int* idx) +{ + return v_int16x16(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); + +} +inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); } +inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); } + +inline v_int32x8 v256_lut(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]), + *(const int*)(tab + idx[4]), *(const int*)(tab + idx[5]), + *(const int*)(tab + idx[6]), *(const int*)(tab + idx[7]) )); +} +inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x8(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int32x8 v256_lut_quads(const int* tab, const int* idx) +{ + return v_int32x8(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return 
v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); } +inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); } + +inline v_int64x4 v256_lut(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_setr_d(*(const long long int*)(tab + idx[0]), *(const long long int*)(tab + idx[1]), + *(const long long int*)(tab + idx[2]), *(const long long int*)(tab + idx[3]) )); +} +inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); +} +inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); } +inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x8 v256_lut(const float* tab, const int* idx) +{ + return v_float32x8(_v256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} +inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); } +inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); } + +inline v_float64x4 v256_lut(const double* tab, const int* idx) +{ + return v_float64x4(_v256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) +{ return v_float64x4(_v256_combine(__lsx_vld(tab + idx[0], 0), __lsx_vld(tab + idx[1], 0))); } + +inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec) +{ + int *idx = (int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_uint32x8 v_lut(const unsigned* tab, const v_int32x8& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v256_lut(tab, idx); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y) +{ + const int *idx = (const int*)&idxvec.val; + __m128i xy01, xy45, xy23, xy67; + xy01 = __lsx_vld(tab + idx[0], 0); + xy01 = __lsx_vextrins_d(xy01, __lsx_vld(tab + idx[1], 0), 0x10); + xy45 = __lsx_vld(tab + idx[4], 0); + xy45 = __lsx_vextrins_d(xy45, __lsx_vld(tab + idx[5], 0), 0x10); + __m256i xy0145 = _v256_combine(xy01, xy45); + xy23 = __lsx_vld(tab + idx[2], 0); + xy23 = __lsx_vextrins_d(xy23, __lsx_vld(tab + idx[3], 0), 0x10); + xy67 = __lsx_vld(tab + idx[6], 0); + xy67 = __lsx_vextrins_d(xy67, __lsx_vld(tab + idx[7], 0), 0x10); + __m256i xy2367 = _v256_combine(xy23, xy67); + + __m256i xxyy0145 = __lasx_xvilvl_w(xy2367, xy0145); + __m256i xxyy2367 = __lasx_xvilvh_w(xy2367, xy0145); + + x = v_float32x8(__lasx_xvilvl_w(xxyy2367, xxyy0145)); + y = v_float32x8(__lasx_xvilvh_w(xxyy2367, xxyy0145)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y) +{ + //int CV_DECL_ALIGNED(32) idx[4]; + const int *idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy2 = __lsx_vld(tab + idx[2], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 
0); + __m128i xy3 = __lsx_vld(tab + idx[3], 0); + __m256i xy02 = _v256_combine(xy0, xy2); + __m256i xy13 = _v256_combine(xy1, xy3); + + x = v_float64x4(__lasx_xvilvl_d(xy13, xy02)); + y = v_float64x4(__lasx_xvilvh_d(xy13, xy02)); +} + +inline v_int8x32 v_interleave_pairs(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200))); +} +inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x32 v_interleave_quads(const v_int8x32& vec) +{ + return v_int8x32(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400))); +} +inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_interleave_pairs(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100))); +} +inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x16 v_interleave_quads(const v_int16x16& vec) +{ + return v_int16x16(__lasx_xvshuf_b(vec.val, vec.val, + _v256_set_d(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100))); +} +inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_interleave_pairs(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvshuf4i_w(vec.val, 0xd8)); +} +inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) +{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x32 v_pack_triplets(const v_int8x32& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vzero, vec.val, + _v256_set_d(0x1211100f0e0d0c0a, 0x0908060504020100, 0x1211100f0e0d0c0a, 0x0908060504020100)); + return v_int8x32(__lasx_xvperm_w(t1, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) +{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x16 v_pack_triplets(const v_int16x16& vec) +{ + __m256i vzero = __lasx_xvreplgr2vr_w(0); + __m256i t1 = __lasx_xvshuf_b(vzero, vec.val, + _v256_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100, 0x11100f0e0d0c0b0a, 0x0908050403020100)); + return v_int16x16(__lasx_xvperm_w(t1, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) +{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x8 v_pack_triplets(const v_int32x8& vec) +{ + return v_int32x8(__lasx_xvperm_w(vec.val, + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} +inline v_uint32x8 v_pack_triplets(const v_uint32x8& vec) +{ return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x8 v_pack_triplets(const v_float32x8& vec) +{ + return 
v_float32x8(__lasx_xvperm_w(*(__m256i*)(&vec.val), + _v256_set_d(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000))); +} + +////////// Matrix operations ///////// + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b) +{ return v_int32x8(__lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val))); } + +inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_add(v_dotprod(a, b), c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b) +{ + __m256i even = __lasx_xvmulwev_d_w(a.val, b.val); + return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val)); +} +inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ + __m256i even = __lasx_xvmaddwev_d_w(c.val, a.val, b.val); + return v_int64x4(__lasx_xvmaddwod_d_w(even, a.val, b.val)); +} + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b) +{ + __m256i even = __lasx_xvmulwev_h_bu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_h_bu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_wu_hu(even, even); + __m256i prod1 = __lasx_xvhaddw_wu_hu(odd, odd); + return v_uint32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b) +{ + __m256i even = __lasx_xvmulwev_h_b(a.val, b.val); + __m256i odd = __lasx_xvmulwod_h_b(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_w_h(even, even); + __m256i prod1 = __lasx_xvhaddw_w_h(odd, odd); + return v_int32x8(__lasx_xvadd_w(prod0, prod1)); +} +inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_du_wu(even, even); + __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd); + return v_uint64x4(__lasx_xvadd_d(prod0, prod1)); +} +inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b) +{ + __m256i even = __lasx_xvmulwev_w_h(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_h(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_d_w(even, even); + __m256i prod1 = __lasx_xvhaddw_d_w(odd, odd); + return v_int64x4(__lasx_xvadd_d(prod0, prod1)); +} + +inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b) +{ return v_dotprod(a, b); } +inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x4 v_dotprod_fast(const 
v_int32x8& a, const v_int32x8& b) +{ return v_dotprod(a, b); } +inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i even = __lasx_xvmulwev_w_hu(a.val, b.val); + __m256i odd = __lasx_xvmulwod_w_hu(a.val, b.val); + __m256i prod0 = __lasx_xvhaddw_du_wu(even, even); + __m256i prod1 = __lasx_xvhaddw_du_wu(odd, odd); + return v_uint64x4(__lasx_xvadd_d(__lasx_xvilvl_d(prod1, prod0), __lasx_xvilvh_d(prod1, prod0))); +} +inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c) +{ return v_add(v_dotprod_expand_fast(a, b), c); } + +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b) +{ + __m256i prod = __lasx_xvadd_w(__lasx_xvmulwev_w_h(a.val, b.val), __lasx_xvmulwod_w_h(a.val, b.val)); + __m256i sign = __lasx_xvsrai_w(prod, 31); + __m256i lo = __lasx_xvilvl_w(sign, prod); + __m256i hi = __lasx_xvilvh_w(sign, prod); + return v_int64x4(__lasx_xvadd_d(lo, hi)); +} +inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c) +{ return v_add(v_dotprod_expand_fast(a, b), c); } + +// 32 >> 64f +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c) +{ return v_dotprod_expand(a, b, c); } + + +#define OPENCV_HAL_LASX_SPLAT2_PS(a, im) \ + v_float32x8(__lasx_xvpermi_w(a.val, a.val, im)) + +inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& m3) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + v_float32x8 v37 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xFF); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v_mul(v37, m3)))); +} + +inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0, + const v_float32x8& m1, const v_float32x8& m2, + const v_float32x8& a) +{ + v_float32x8 v04 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0); + v_float32x8 v15 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0x55); + v_float32x8 v26 = OPENCV_HAL_LASX_SPLAT2_PS(v, 0xAA); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + + +#define OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(_Tpvec, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m256i t0 = cast_from(__lasx_xvilvl_w(a1.val, a0.val)); \ + __m256i t1 = cast_from(__lasx_xvilvl_w(a3.val, a2.val)); \ + __m256i t2 = cast_from(__lasx_xvilvh_w(a1.val, a0.val)); \ + __m256i t3 = cast_from(__lasx_xvilvh_w(a3.val, a2.val)); \ + b0.val = cast_to(__lasx_xvilvl_d(t1, t0)); \ + 
b1.val = cast_to(__lasx_xvilvh_d(t1, t0)); \ + b2.val = cast_to(__lasx_xvilvl_d(t3, t2)); \ + b3.val = cast_to(__lasx_xvilvh_d(t3, t2)); \ + } + +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_uint32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_LASX_TRANSPOSE4x4(v_int32x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP) + +inline void v_transpose4x4(const v_float32x8 &a0, const v_float32x8 &a1, + const v_float32x8 &a2, const v_float32x8 &a3, + v_float32x8 &b0, v_float32x8 &b1, v_float32x8 &b2, v_float32x8 &b3) +{ + __m256i t0 = __lasx_xvilvl_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t1 = __lasx_xvilvl_w(__m256i(a3.val), __m256i(a2.val)); + __m256i t2 = __lasx_xvilvh_w(__m256i(a1.val), __m256i(a0.val)); + __m256i t3 = __lasx_xvilvh_w(__m256i(a3.val), __m256i(a2.val)); + b0.val = __m256(__lasx_xvilvl_d(t1, t0)); + b1.val = __m256(__lasx_xvilvh_d(t1, t0)); + b2.val = __m256(__lasx_xvilvl_d(t3, t2)); + b3.val = __m256(__lasx_xvilvh_d(t3, t2)); +} + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_LASX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(a.val); \ + b1.val = intrin(__lasx_xvpermi_q(a.val, a.val, 0x11)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(a.val)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(__lasx_xvpermi_q(a.val, a.val, 0x11))); } \ + inline _Tpwvec v256_load_expand(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpwvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint8x32, v_uint16x16, uchar, __lasx_vext2xv_hu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int8x32, v_int16x16, schar, __lasx_vext2xv_h_b) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint16x16, v_uint32x8, ushort, __lasx_vext2xv_wu_hu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int16x16, v_int32x8, short, __lasx_vext2xv_w_h) +OPENCV_HAL_IMPL_LASX_EXPAND(v_uint32x8, v_uint64x4, unsigned, __lasx_vext2xv_du_wu) +OPENCV_HAL_IMPL_LASX_EXPAND(v_int32x8, v_int64x4, int, __lasx_vext2xv_d_w) + +#define OPENCV_HAL_IMPL_LASX_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v256_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpvec(intrin(*((__m256i*)&a))); \ + } + +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_uint32x8, uchar, __lasx_vext2xv_wu_bu) +OPENCV_HAL_IMPL_LASX_EXPAND_Q(v_int32x8, schar, __lasx_vext2xv_w_b) + +/* pack */ +// 16 +inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b) +{ return v_int8x32(_v256_shuffle_odd_64(_lasx_packs_h(a.val, b.val))); } + +inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b) +{ return v_uint8x32(_v256_shuffle_odd_64(__lasx_xvssrlrni_bu_h(b.val, a.val, 0))); } + +inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b) +{ + return v_uint8x32(_v256_shuffle_odd_64(_lasx_packus_h(a.val, b.val))); +} + +inline void v_pack_store(schar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(uchar *ptr, const v_uint16x16& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x16& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template inline +v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i res = __lasx_xvssrlrni_bu_h(b.val, a.val, n); + return v_uint8x32(_v256_shuffle_odd_64(res)); +} + +template inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a) +{ + __m256i res = __lasx_xvssrlrni_bu_h(a.val, a.val, n); + 
__lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +template<int n> inline +v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b) +{ + __m256i res = __lasx_xvssrarni_bu_h(b.val, a.val, n); + return v_uint8x32(_v256_shuffle_odd_64(res)); +} + +template<int n> inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a) +{ + __m256i res = __lasx_xvssrarni_bu_h(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +template<int n> inline +v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b) +{ + __m256i res = __lasx_xvssrarni_b_h(b.val, a.val, n); + return v_int8x32(_v256_shuffle_odd_64(res)); +} + +template<int n> inline +void v_rshr_pack_store(schar* ptr, const v_int16x16& a) +{ + __m256i res = __lasx_xvssrarni_b_h(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +// 32 +inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b) +{ return v_int16x16(_v256_shuffle_odd_64(_lasx_packs_w(a.val, b.val))); } + +inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(_v256_packs_epu32(a.val, b.val))); } + +inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(_lasx_packus_w(a.val, b.val))); } + +inline void v_pack_store(short* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort* ptr, const v_uint32x8& a) +{ + __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, 0); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +inline void v_pack_u_store(ushort* ptr, const v_int32x8& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template<int n> inline +v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrlrni_hu_w(b.val, a.val, n))); } + +template<int n> inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a) +{ + __m256i res = __lasx_xvssrlrni_hu_w(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +template<int n> inline +v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b) +{ return v_uint16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_hu_w(b.val, a.val, n))); } + +template<int n> inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a) +{ + __m256i res = __lasx_xvssrarni_hu_w(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +template<int n> inline +v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b) +{ return v_int16x16(_v256_shuffle_odd_64(__lasx_xvssrarni_h_w(b.val, a.val, n))); } + +template<int n> inline +void v_rshr_pack_store(short* ptr, const v_int32x8& a) +{ + __m256i res = __lasx_xvssrarni_h_w(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +// 64 +// Non-saturating pack +inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b) +{ + __m256i ab = __lasx_xvpickev_w(b.val, a.val); + return v_uint32x8(_v256_shuffle_odd_64(ab)); +} + +inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + __m256i a0 = __lasx_xvshuf4i_w(a.val, 0x08); + v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0))); +} + +inline void v_pack_store(int* ptr, const v_int64x4& b) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); } + +template<int n> inline
+v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b) +{ return v_uint32x8(_v256_shuffle_odd_64(__lasx_xvsrlrni_w_d(b.val, a.val, n))); } + +template inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a) +{ + __m256i res = __lasx_xvsrlrni_w_d(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +template inline +v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b) +{ return v_int32x8(_v256_shuffle_odd_64(__lasx_xvsrarni_w_d(b.val, a.val, n))); } + +template inline +void v_rshr_pack_store(int* ptr, const v_int64x4& a) +{ + __m256i res = __lasx_xvsrarni_w_d(a.val, a.val, n); + __lasx_xvstelm_d(res, ptr, 0, 0); + __lasx_xvstelm_d(res, ptr, 8, 2); +} + +// pack boolean +inline v_uint8x32 v_pack_b(const v_uint16x16& a, const v_uint16x16& b) +{ + __m256i ab = _lasx_packs_h(a.val, b.val); + return v_uint8x32(_v256_shuffle_odd_64(ab)); +} + +inline v_uint8x32 v_pack_b(const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d) +{ + __m256i ab = _lasx_packs_w(a.val, b.val); + __m256i cd = _lasx_packs_w(c.val, d.val); + + __m256i abcd = _v256_shuffle_odd_64(_lasx_packs_h(ab, cd)); + return v_uint8x32(__lasx_xvshuf4i_w(abcd, 0xd8)); +} + +inline v_uint8x32 v_pack_b(const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + const v_uint64x4& d, const v_uint64x4& e, const v_uint64x4& f, + const v_uint64x4& g, const v_uint64x4& h) +{ + __m256i ab = _lasx_packs_w(a.val, b.val); + __m256i cd = _lasx_packs_w(c.val, d.val); + __m256i ef = _lasx_packs_w(e.val, f.val); + __m256i gh = _lasx_packs_w(g.val, h.val); + + __m256i abcd = _lasx_packs_w(ab, cd); + __m256i efgh = _lasx_packs_w(ef, gh); + __m256i pkall = _v256_shuffle_odd_64(_lasx_packs_h(abcd, efgh)); + + __m256i rev = _v256_alignr_b(pkall, pkall, 8); + return v_uint8x32(__lasx_xvilvl_h(rev, pkall)); +} + +/* Recombine */ +// its up there with load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_LASX_EXTRACT(_Tpvec) \ + template \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right(a, b); } + +OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint8x32) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_int8x32) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint16x16) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_int16x16) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint32x8) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_int32x8) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_uint64x4) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_int64x4) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_float32x8) +OPENCV_HAL_IMPL_LASX_EXTRACT(v_float64x4) + +template +inline uchar v_extract_n(v_uint8x32 a) +{ + return (uchar)_v256_extract_b(a.val); +} + +template +inline schar v_extract_n(v_int8x32 a) +{ + return (schar)v_extract_n(v_reinterpret_as_u8(a)); +} + +template +inline ushort v_extract_n(v_uint16x16 a) +{ + return (ushort)_v256_extract_h(a.val); +} + +template +inline short v_extract_n(v_int16x16 a) +{ + return (short)v_extract_n(v_reinterpret_as_u16(a)); +} + +template +inline uint v_extract_n(v_uint32x8 a) +{ + return (uint)_v256_extract_w(a.val); +} + +template +inline int v_extract_n(v_int32x8 a) +{ + return (int)v_extract_n(v_reinterpret_as_u32(a)); +} + +template +inline uint64 v_extract_n(v_uint64x4 a) +{ + return (uint64)_v256_extract_d(a.val); +} + +template +inline int64 v_extract_n(v_int64x4 v) +{ + return (int64)v_extract_n(v_reinterpret_as_u64(v)); +} + +template +inline float v_extract_n(v_float32x8 v) +{ + union { uint iv; float fv; } d; + d.iv = v_extract_n(v_reinterpret_as_u32(v)); + return d.fv; +} + +template +inline 
double v_extract_n(v_float64x4 v) +{ + union { uint64 iv; double dv; } d; + d.iv = v_extract_n(v_reinterpret_as_u64(v)); + return d.dv; +} + +template +inline v_uint32x8 v_broadcast_element(v_uint32x8 a) +{ + static const __m256i perm = __lasx_xvreplgr2vr_w((char)i); + return v_uint32x8(__lasx_xvperm_w(a.val, perm)); +} + +template +inline v_int32x8 v_broadcast_element(const v_int32x8 &a) +{ return v_reinterpret_as_s32(v_broadcast_element(v_reinterpret_as_u32(a))); } + +template +inline v_float32x8 v_broadcast_element(const v_float32x8 &a) +{ return v_reinterpret_as_f32(v_broadcast_element(v_reinterpret_as_u32(a))); } + +///////////////////// load deinterleave ///////////////////////////// + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b) +{ + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + + __m256i p0 = __lasx_xvpickev_b(t1, t0); + __m256i p1 = __lasx_xvpickod_b(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b ) +{ + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + + __m256i p0 = __lasx_xvpickev_h(t1, t0); + __m256i p1 = __lasx_xvpickod_h(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) +{ + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + + __m256i p0 = __lasx_xvpickev_w(t1, t0); + __m256i p1 = __lasx_xvpickod_w(t1, t0); + + a.val = __lasx_xvpermi_d(p0, 0xd8); + b.val = __lasx_xvpermi_d(p1, 0xd8); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) +{ + __m256i ab0 = __lasx_xvld(ptr, 0); + __m256i ab1 = __lasx_xvld(ptr, 32); + + __m256i pl = __lasx_xvpermi_q(ab0, ab1, 0x02); + __m256i ph = __lasx_xvpermi_q(ab0, ab1, 0x13); + __m256i a0 = __lasx_xvilvl_d(ph, pl); + __m256i b0 = __lasx_xvilvh_d(ph, pl); + a = v_uint64x4(a0); + b = v_uint64x4(b0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); + + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + + const __m256i + sh_b = _v256_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, + 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), + sh_g = _v256_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, + 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14), + sh_r = _v256_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, + 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint8x32(b0); + b = v_uint8x32(g0); + c = 
v_uint8x32(r0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m0), s02_high, m1); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m1), bgr1, m0); + const __m256i sh_b = _v256_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + const __m256i sh_r = _v256_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + b0 = __lasx_xvshuf_b(b0, b0, sh_b); + g0 = __lasx_xvshuf_b(g0, g0, sh_g); + r0 = __lasx_xvshuf_b(r0, r0, sh_r); + + a = v_uint16x16(b0); + b = v_uint16x16(g0); + c = v_uint16x16(r0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); + + __m256i s02_low = __lasx_xvpermi_q(bgr0, bgr2, 0x02); + __m256i s02_high = __lasx_xvpermi_q(bgr0, bgr2, 0x13); + + __m256i m24 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + __m256i m92 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i b0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_low, s02_high, m24), bgr1, m92); + __m256i g0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(s02_high, s02_low, m92), bgr1, m24); + __m256i r0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(bgr1, s02_low, m24), s02_high, m92); + + b0 = __lasx_xvshuf4i_w(b0, 0x6c); + g0 = __lasx_xvshuf4i_w(g0, 0xb1); + r0 = __lasx_xvshuf4i_w(r0, 0xc6); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c ) +{ + __m256i bgr0 = __lasx_xvld(ptr, 0); + __m256i bgr1 = __lasx_xvld(ptr, 32); + __m256i bgr2 = __lasx_xvld(ptr, 64); + + __m256i s01 = __lasx_xvpermi_q(bgr0, bgr1, 0x12); // get bgr0 low 128 and bgr1 high 128 + __m256i s12 = __lasx_xvpermi_q(bgr1, bgr2, 0x12); + __m256i s20r = __lasx_xvpermi_d(__lasx_xvpermi_q(bgr2, bgr0, 0x12), 0x1b); + __m256i b0 = __lasx_xvilvl_d(s20r, s01); + __m256i g0 = _v256_alignr_b(s12, s01, 8); + __m256i r0 = __lasx_xvilvh_d(s12, s20r); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x32& a, v_uint8x32& b, v_uint8x32& c, v_uint8x32& d) +{ + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + __m256i t2 = __lasx_xvld(ptr, 64); + __m256i t3 = __lasx_xvld(ptr, 96); + + const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7); + __m256i ac_lo = __lasx_xvpickev_b(t1, t0); + __m256i bd_lo = __lasx_xvpickod_b(t1, t0); + __m256i ac_hi = __lasx_xvpickev_b(t3, t2); + __m256i bd_hi = 
__lasx_xvpickod_b(t3, t2); + + __m256i a_pre = __lasx_xvpickev_b(ac_hi, ac_lo); + __m256i c_pre = __lasx_xvpickod_b(ac_hi, ac_lo); + __m256i b_pre = __lasx_xvpickev_b(bd_hi, bd_lo); + __m256i d_pre = __lasx_xvpickod_b(bd_hi, bd_lo); + + a.val = __lasx_xvperm_w(a_pre, sh); + b.val = __lasx_xvperm_w(b_pre, sh); + c.val = __lasx_xvperm_w(c_pre, sh); + d.val = __lasx_xvperm_w(d_pre, sh); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x16& a, v_uint16x16& b, v_uint16x16& c, v_uint16x16& d) +{ + __m256i t0 = __lasx_xvld(ptr, 0); + __m256i t1 = __lasx_xvld(ptr, 32); + __m256i t2 = __lasx_xvld(ptr, 64); + __m256i t3 = __lasx_xvld(ptr, 96); + + const __m256i sh = _v256_setr_w(0, 4, 1, 5, 2, 6, 3, 7); + __m256i ac_lo = __lasx_xvpickev_h(t1, t0); + __m256i bd_lo = __lasx_xvpickod_h(t1, t0); + __m256i ac_hi = __lasx_xvpickev_h(t3, t2); + __m256i bd_hi = __lasx_xvpickod_h(t3, t2); + + __m256i a_pre = __lasx_xvpickev_h(ac_hi, ac_lo); + __m256i c_pre = __lasx_xvpickod_h(ac_hi, ac_lo); + __m256i b_pre = __lasx_xvpickev_h(bd_hi, bd_lo); + __m256i d_pre = __lasx_xvpickod_h(bd_hi, bd_lo); + + a.val = __lasx_xvperm_w(a_pre, sh); + b.val = __lasx_xvperm_w(b_pre, sh); + c.val = __lasx_xvperm_w(c_pre, sh); + d.val = __lasx_xvperm_w(d_pre, sh); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b, v_uint32x8& c, v_uint32x8& d ) +{ + __m256i p0 = __lasx_xvld(ptr, 0); + __m256i p1 = __lasx_xvld(ptr, 32); + __m256i p2 = __lasx_xvld(ptr, 64); + __m256i p3 = __lasx_xvld(ptr, 96); + + __m256i p01l = __lasx_xvilvl_w(p1, p0); + __m256i p01h = __lasx_xvilvh_w(p1, p0); + __m256i p23l = __lasx_xvilvl_w(p3, p2); + __m256i p23h = __lasx_xvilvh_w(p3, p2); + + __m256i pll = __lasx_xvpermi_q(p01l, p23l, 0x02); + __m256i plh = __lasx_xvpermi_q(p01l, p23l, 0x13); + __m256i phl = __lasx_xvpermi_q(p01h, p23h, 0x02); + __m256i phh = __lasx_xvpermi_q(p01h, p23h, 0x13); + + __m256i b0 = __lasx_xvilvl_w(plh, pll); + __m256i g0 = __lasx_xvilvh_w(plh, pll); + __m256i r0 = __lasx_xvilvl_w(phh, phl); + __m256i a0 = __lasx_xvilvh_w(phh, phl); + + a = v_uint32x8(b0); + b = v_uint32x8(g0); + c = v_uint32x8(r0); + d = v_uint32x8(a0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b, v_uint64x4& c, v_uint64x4& d ) +{ + __m256i bgra0 = __lasx_xvld(ptr, 0); + __m256i bgra1 = __lasx_xvld(ptr, 32); + __m256i bgra2 = __lasx_xvld(ptr, 64); + __m256i bgra3 = __lasx_xvld(ptr, 96); + + __m256i l02 = __lasx_xvpermi_q(bgra0, bgra2, 0x02); + __m256i h02 = __lasx_xvpermi_q(bgra0, bgra2, 0x13); + __m256i l13 = __lasx_xvpermi_q(bgra1, bgra3, 0x02); + __m256i h13 = __lasx_xvpermi_q(bgra1, bgra3, 0x13); + + __m256i b0 = __lasx_xvilvl_d(l13, l02); + __m256i g0 = __lasx_xvilvh_d(l13, l02); + __m256i r0 = __lasx_xvilvl_d(h13, h02); + __m256i a0 = __lasx_xvilvh_d(h13, h02); + + a = v_uint64x4(b0); + b = v_uint64x4(g0); + c = v_uint64x4(r0); + d = v_uint64x4(a0); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_b(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_b(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 32*1); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y, + 
hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_h(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_h(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 16*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_w(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_w(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 8*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i xy_l = __lasx_xvilvl_d(y.val, x.val); + __m256i xy_h = __lasx_xvilvh_d(y.val, x.val); + + __m256i xy0 = __lasx_xvpermi_q(xy_h, xy_l, 0 + 2*16); + __m256i xy1 = __lasx_xvpermi_q(xy_h, xy_l, 1 + 3*16); + + __lasx_xvst(xy0, (__m256i*)ptr, 0); + __lasx_xvst(xy1, (__m256i*)ptr, 4*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, const v_uint8x32& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + const __m256i sh_g = _v256_setr_b( + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + const __m256i sh_r = _v256_setr_b( + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); + + const __m256i m0 = _v256_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(p0, p2, 0 + 3*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p1, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 32); + __lasx_xvst(bgr2, (__m256i*)ptr, 64); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, const v_uint16x16& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + const __m256i sh_b = _v256_setr_b( + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + const __m256i sh_g = _v256_setr_b( + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + const __m256i sh_r = _v256_setr_b( + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + + __m256i b0 = __lasx_xvshuf_b(a.val, a.val, sh_b); + __m256i g0 = __lasx_xvshuf_b(b.val, b.val, sh_g); + __m256i r0 = __lasx_xvshuf_b(c.val, c.val, sh_r); 
+ + const __m256i m0 = _v256_setr_b(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + const __m256i m1 = _v256_setr_b(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, m0), r0, m1); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, m0), b0, m1); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, m0), g0, m1); + + __m256i bgr0 = __lasx_xvpermi_q(p2, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p2, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgr2, (__m256i*)ptr, 32*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, const v_uint32x8& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i b0 = __lasx_xvshuf4i_w(a.val, 0x6c); + __m256i g0 = __lasx_xvshuf4i_w(b.val, 0xb1); + __m256i r0 = __lasx_xvshuf4i_w(c.val, 0xc6); + + __m256i bitmask_1 = _v256_set_w(-1, 0, 0, -1, 0, 0, -1, 0); + __m256i bitmask_2 = _v256_set_w(0, 0, -1, 0, 0, -1, 0, 0); + + __m256i p0 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(b0, g0, bitmask_1), r0, bitmask_2); + __m256i p1 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(g0, r0, bitmask_1), b0, bitmask_2); + __m256i p2 = __lasx_xvbitsel_v(__lasx_xvbitsel_v(r0, b0, bitmask_1), g0, bitmask_2); + + __m256i bgr0 = __lasx_xvpermi_q(p1, p0, 0 + 2*16); + __m256i bgr2 = __lasx_xvpermi_q(p1, p0, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(p2, (__m256i*)ptr, 8*4); + __lasx_xvst(bgr2, (__m256i*)ptr, 16*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, const v_uint64x4& c, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i s01 = __lasx_xvilvl_d(b.val, a.val); + __m256i s12 = __lasx_xvilvh_d(c.val, b.val); + __m256i s20 = __lasx_xvpermi_w(a.val, c.val, 0xe4); + + __m256i bgr0 = __lasx_xvpermi_q(s20, s01, 0 + 2*16); + __m256i bgr1 = __lasx_xvpermi_q(s01, s12, 0x30); + __m256i bgr2 = __lasx_xvpermi_q(s12, s20, 1 + 3*16); + + __lasx_xvst(bgr0, (__m256i*)ptr, 0); + __lasx_xvst(bgr1, (__m256i*)ptr, 4*8); + __lasx_xvst(bgr2, (__m256i*)ptr, 8*8); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x32& a, const v_uint8x32& b, + const v_uint8x32& c, const v_uint8x32& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_b(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_b(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_b(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_b(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_h(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_h(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_h(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_h(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 32); + __lasx_xvst(bgra2, (__m256i*)ptr, 64); + __lasx_xvst(bgra3, (__m256i*)ptr, 96); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& a, const v_uint16x16& b, + const v_uint16x16& c, const v_uint16x16& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_h(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_h(b.val, a.val); + __m256i ra0 = 
__lasx_xvilvl_h(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_h(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_w(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_w(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_w(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_w(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 16*2); + __lasx_xvst(bgra2, (__m256i*)ptr, 32*2); + __lasx_xvst(bgra3, (__m256i*)ptr, 48*2); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& a, const v_uint32x8& b, + const v_uint32x8& c, const v_uint32x8& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_w(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_w(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_w(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_w(d.val, c.val); + + __m256i bgra0_ = __lasx_xvilvl_d(ra0, bg0); + __m256i bgra1_ = __lasx_xvilvh_d(ra0, bg0); + __m256i bgra2_ = __lasx_xvilvl_d(ra1, bg1); + __m256i bgra3_ = __lasx_xvilvh_d(ra1, bg1); + + __m256i bgra0 = __lasx_xvpermi_q(bgra1_, bgra0_, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(bgra1_, bgra0_, 1 + 3*16); + __m256i bgra1 = __lasx_xvpermi_q(bgra3_, bgra2_, 0 + 2*16); + __m256i bgra3 = __lasx_xvpermi_q(bgra3_, bgra2_, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)ptr, 8*4); + __lasx_xvst(bgra2, (__m256i*)ptr, 16*4); + __lasx_xvst(bgra3, (__m256i*)ptr, 24*4); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& a, const v_uint64x4& b, + const v_uint64x4& c, const v_uint64x4& d, + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) +{ + __m256i bg0 = __lasx_xvilvl_d(b.val, a.val); + __m256i bg1 = __lasx_xvilvh_d(b.val, a.val); + __m256i ra0 = __lasx_xvilvl_d(d.val, c.val); + __m256i ra1 = __lasx_xvilvh_d(d.val, c.val); + + __m256i bgra0 = __lasx_xvpermi_q(ra0, bg0, 0 + 2*16); + __m256i bgra1 = __lasx_xvpermi_q(ra1, bg1, 0 + 2*16); + __m256i bgra2 = __lasx_xvpermi_q(ra0, bg0, 1 + 3*16); + __m256i bgra3 = __lasx_xvpermi_q(ra1, bg1, 1 + 3*16); + + __lasx_xvst(bgra0, (__m256i*)ptr, 0); + __lasx_xvst(bgra1, (__m256i*)(ptr), 4*8); + __lasx_xvst(bgra2, (__m256i*)(ptr), 8*8); + __lasx_xvst(bgra3, (__m256i*)(ptr), 12*8); +} + + +#define OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ 
+ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1/*, mode*/); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1/*, mode*/); \ +} + +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) +OPENCV_HAL_IMPL_LASX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) + +// +// FP16 +// + +inline v_float32x8 v256_load_expand(const hfloat* ptr) +{ +#if CV_FP16 + //1-load128, 2-permi, 3-cvt + return v_float32x8(__lasx_xvfcvtl_s_h(__lasx_xvpermi_d(__lsx_vld((const __m128i*)ptr, 0), 0x10))); +#else + float CV_DECL_ALIGNED(32) buf[8]; + for (int i = 0; i < 8; i++) + buf[i] = (float)ptr[i]; + return v256_load_aligned(buf); +#endif +} + +inline void v_pack_store(hfloat* ptr, const v_float32x8& a) +{ +#if CV_FP16 + __m256i ah = __lasx_xvfcvt_h_s(a.val, a.val); + __lsx_vst((_m128i)ah, ptr, 0); +#else + float CV_DECL_ALIGNED(32) buf[8]; + v_store_aligned(buf, a); + for (int i = 0; i < 8; i++) + ptr[i] = hfloat(buf[i]); +#endif +} + +// +// end of FP16 +// + +inline void v256_cleanup() {} + +#include "intrin_math.hpp" +inline v_float32x8 v_exp(const v_float32x8& x) { return v_exp_default_32f(x); } +inline v_float32x8 v_log(const v_float32x8& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x8& x, v_float32x8& s, v_float32x8& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x8 v_sin(const v_float32x8& x) { return v_sin_default_32f(x); } +inline v_float32x8 v_cos(const v_float32x8& x) { return v_cos_default_32f(x); } +inline v_float32x8 v_erf(const v_float32x8& x) { return v_erf_default_32f(x); } + +inline v_float64x4 v_exp(const v_float64x4& x) { return v_exp_default_64f(x); } +inline v_float64x4 v_log(const v_float64x4& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x4& x, v_float64x4& s, v_float64x4& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x4 v_sin(const v_float64x4& x) { return v_sin_default_64f(x); } +inline v_float64x4 v_cos(const v_float64x4& x) { return v_cos_default_64f(x); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_LASX_HPP diff --git a/3rdParty/opencv2/core/hal/intrin_lsx.hpp b/3rdParty/opencv2/core/hal/intrin_lsx.hpp new file mode 100644 index 0000000000..a2f23d6abe --- /dev/null +++ b/3rdParty/opencv2/core/hal/intrin_lsx.hpp @@ -0,0 +1,2546 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_HAL_INTRIN_LSX_HPP +#define OPENCV_HAL_INTRIN_LSX_HPP + +#include <lsxintrin.h> + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 1 +#define CV_SIMD128_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +/////////// Utils //////// + +inline __m128i _v128_setr_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, + char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15) +{ + return (__m128i)v16i8{ v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15 }; +} + +inline __m128i _v128_set_b(char v0, char v1, char v2, char v3, char v4, char v5, char v6, + char v7, char v8, char v9, char v10, char v11, char v12, char v13, char v14, char v15) +{ + return (__m128i)v16i8{ v15, v14, v13, v12, v11, v10, v9, v8, + v7, v6, v5, v4, v3, v2, v1, v0 }; +} + +inline __m128i _v128_setr_h(short v0, short v1, short v2, short v3, short v4, short v5, + short v6, short v7) +{ + return (__m128i)v8i16{ v0, v1, v2, v3, v4, v5, v6, v7 }; +} + +inline __m128i _v128_setr_w(int v0, int v1, int v2, int v3) +{ + return (__m128i)v4i32{ v0, v1, v2, v3 }; +} + +inline __m128i _v128_set_w(int v0, int v1, int v2, int v3) +{ + return (__m128i)v4i32{ v3, v2, v1, v0 }; +} + +inline __m128i _v128_setall_w(int v0) +{ + return __lsx_vreplgr2vr_w(v0); +} + +inline __m128i _v128_setr_d(int64 v0, int64 v1) +{ + return (__m128i)v2i64{ v0, v1 }; +} + +inline __m128i _v128_set_d(int64 v0, int64 v1) +{ + return (__m128i)v2i64{ v1, v0 }; +} + +inline __m128 _v128_setr_ps(float v0, float v1, float v2, float v3) +{ + return (__m128)v4f32{ v0, v1, v2, v3 }; +} + +inline __m128 _v128_setall_ps(float v0) +{ + return (__m128)v4f32{ v0, v0, v0, v0 }; +} + +inline __m128d _v128_setr_pd(double v0, double v1) +{ + return (__m128d)v2f64{ v0, v1 }; +} + +inline __m128d _v128_setall_pd(double v0) +{ + return (__m128d)v2f64{ v0, v0 }; +} + +inline __m128i _lsx_packus_h(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_bu_h(b, a, 0); +} + +inline __m128i _lsx_packs_h(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_b_h(b, a, 0); +} + +inline __m128i _lsx_packus_w(const __m128i& a, const __m128i& b) +{ + return __lsx_vssrarni_hu_w(b, a, 0); +} + +/////// Types /////// + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16}; + + v_uint8x16() {} + explicit v_uint8x16(__m128i v): val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + uchar get0() const + { + return (uchar)__lsx_vpickve2gr_bu(val, 0); + } + + __m128i val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(__m128i v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12,
schar v13, schar v14, schar v15) + { + val = _v128_setr_b(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + schar get0() const + { + return (schar)__lsx_vpickve2gr_b(val, 0); + } + + __m128i val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(__m128i v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); + } + + ushort get0() const + { + return (ushort)__lsx_vpickve2gr_hu(val, 0); + } + + __m128i val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(__m128i v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + val = _v128_setr_h(v0, v1, v2, v3, v4, v5, v6, v7); + } + + short get0() const + { + return (short)__lsx_vpickve2gr_h(val, 0); + } + + __m128i val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(__m128i v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + val = _v128_setr_w(v0, v1, v2, v3); + } + + unsigned get0() const + { + return (unsigned)__lsx_vpickve2gr_wu(val, 0); + } + + __m128i val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(__m128i v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + val = _v128_setr_w(v0, v1, v2, v3); + } + + int get0() const + { + return (int)__lsx_vpickve2gr_w(val, 0); + } + + __m128i val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4}; + + v_float32x4() {} + explicit v_float32x4(__m128 v) : val(v) {} + explicit v_float32x4(__m128i v) { val = *((__m128*)&v); } + v_float32x4(float v0, float v1, float v2, float v3) + { + val = _v128_setr_ps(v0, v1, v2, v3); + } + + float get0() const + { + union { int iv; float fv; } d; + d.iv = __lsx_vpickve2gr_w(val, 0); + return d.fv; + } + + int get0toint() const + { + __m128i result = __lsx_vftintrz_w_s(val); + return (int)__lsx_vpickve2gr_w(result, 0); + } + + __m128 val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + enum { nlanes = 2}; + + v_uint64x2() {} + explicit v_uint64x2(__m128i v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + val = _v128_setr_d(v0, v1); + } + + uint64 get0() const + { + return __lsx_vpickve2gr_du(val, 0); + } + + __m128i val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2}; + + v_int64x2() {} + explicit v_int64x2(__m128i v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + val = _v128_setr_d(v0, v1); + } + + uint64 get0() const + { + return __lsx_vpickve2gr_d(val, 0); + } + + __m128i val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2}; + + v_float64x2() {} + explicit v_float64x2(__m128d v) : val(v) {} + explicit v_float64x2(__m128i v) { val = *((__m128d*)&v); } + v_float64x2(double v0, double v1) + { + val = _v128_setr_pd(v0, v1); + } + + double get0() const + { + union { int64 iv; double fv; } d; + d.iv = __lsx_vpickve2gr_d(val, 0); + return d.fv; + } + + int64 get0toint64() const + { + __m128i result = __lsx_vftintrz_l_d(val); + return (int64)__lsx_vpickve2gr_d(result, 0); + } + + __m128d val; +}; + +////////////// Load and store operations ///////// + +#define OPENCV_HAL_IMPL_LSX_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v_load(const _Tp* ptr) \ + { 
return _Tpvec(__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_aligned(const _Tp* ptr) \ + { return _Tpvec(__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_low(const _Tp* ptr) \ + { return _Tpvec(__lsx_vldrepl_d(ptr, 0)); } \ + inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vl = __lsx_vldrepl_d(ptr0, 0); \ + __m128i vh = __lsx_vldrepl_d(ptr1, 0); \ + return _Tpvec(__lsx_vilvl_d(vh, vl)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst(a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\ + { \ + if ( mode == hal::STORE_UNALIGNED) \ + __lsx_vst(a.val, ptr, 0); \ + else if ( mode == hal::STORE_ALIGNED_NOCACHE) \ + __lsx_vst(a.val, ptr, 0); \ + else \ + __lsx_vst(a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d(a.val, ptr, 0, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d(a.val, ptr, 0, 1); } \ + +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint8x16, uchar) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int8x16, schar) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint16x8, ushort) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int16x8, short) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint32x4, unsigned) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int32x4, int) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_uint64x2, uint64) +OPENCV_HAL_IMPL_LSX_LOADSTORE(v_int64x2, int64) + +#define OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(_Tpvec, _Tp, halfreg) \ + inline _Tpvec v_load(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_aligned(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vld(ptr, 0)); } \ + inline _Tpvec v_load_low(const _Tp* ptr) \ + { return _Tpvec((halfreg)__lsx_vldrepl_d(ptr, 0)); } \ + inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m128i vl = __lsx_vldrepl_d(ptr0, 0); \ + __m128i vh = __lsx_vldrepl_d(ptr1, 0); \ + return _Tpvec((halfreg)__lsx_vilvl_d(vh, vl)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vst((__m128i)a.val, ptr, 0); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode)\ + { \ + if( mode == hal::STORE_UNALIGNED) \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE) \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + else \ + __lsx_vst((__m128i)a.val, ptr, 0); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 0); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { __lsx_vstelm_d((__m128i)a.val, ptr, 0, 1); } \ + +OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float32x4, float, __m128) +OPENCV_HAL_IMPL_LSX_LOADSTORE_FLT(v_float64x2, double, __m128d) + +inline __m128i _lsx_128_castps_si128(const __m128& v) +{ return __m128i(v); } + +inline __m128i _lsx_128_castpd_si128(const __m128d& v) +{ return __m128i(v); } + +#define OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_LSX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v_setzero_##suffix() \ + { return 
_Tpvec(__lsx_vldi(0)); } \ + inline _Tpvec v_setall_##suffix(_Tp v) \ + { return _Tpvec(__lsx_vreplgr2vr_##ssuffix((ctype_s)v)); } \ + template <> inline _Tpvec v_setzero_<_Tpvec>() \ + { return v_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_<_Tpvec>(_Tp v) \ + { return v_setall_##suffix(v); } \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float32x4, suffix, _lsx_128_castps_si128) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_float64x2, suffix, _lsx_128_castpd_si128) \ + +OPENCV_HAL_IMPL_LSX_INIT(v_uint8x16, uchar, u8, b, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int8x16, schar, s8, b, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint16x8, ushort, u16, h, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int16x8, short, s16, h, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint32x4, unsigned, u32, w, int) +OPENCV_HAL_IMPL_LSX_INIT(v_int32x4, int, s32, w, int) +OPENCV_HAL_IMPL_LSX_INIT(v_uint64x2, uint64, u64, d, long int) +OPENCV_HAL_IMPL_LSX_INIT(v_int64x2, int64, s64, d, long int) + +inline __m128 _lsx_128_castsi128_ps(const __m128i &v) +{ return __m128(v); } + +inline __m128d _lsx_128_castsi128_pd(const __m128i &v) +{ return __m128d(v); } + +#define OPENCV_HAL_IMPL_LSX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v_setzero_##suffix() \ + { return _Tpvec(__lsx_vldi(0)); } \ + inline _Tpvec v_setall_##suffix(_Tp v) \ + { return _Tpvec(_v128_setall_##zsuffix(v)); } \ + template <> inline _Tpvec v_setzero_<_Tpvec>() \ + { return v_setzero_##suffix(); } \ + template <> inline _Tpvec v_setall_<_Tpvec>(_Tp v) \ + { return v_setall_##suffix(v); } \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint8x16, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int8x16, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint16x8, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int16x8, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint32x4, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int32x4, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_uint64x2, suffix, cast) \ + OPENCV_HAL_IMPL_LSX_CAST(_Tpvec, v_int64x2, suffix, cast) \ + +OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float32x4, float, f32, ps, _lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_INIT_FLT(v_float64x2, double, f64, pd, _lsx_128_castsi128_pd) + +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) +{ return a; } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) +{ return v_float32x4(_lsx_128_castps_si128(__m128(a.val))); } + +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) +{ return a; } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) +{ return v_float64x2(_lsx_128_castpd_si128(__m128d(a.val))); } + +//////////////// Variant Value reordering /////////////// + +// unpacks +#define OPENCV_HAL_IMPL_LSX_UNPACK(_Tpvec, suffix) \ + inline _Tpvec v128_unpacklo(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vilvl_##suffix(__m128i(b.val), __m128i(a.val))); } \ + inline _Tpvec v128_unpackhi(const _Tpvec& a, const _Tpvec& b) \ + { return
_Tpvec(__lsx_vilvh_##suffix(__m128i(b.val), __m128i(a.val))); } \ + +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint8x16, b) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int8x16, b) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint16x8, h) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int16x8, h) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_uint64x2, d) +OPENCV_HAL_IMPL_LSX_UNPACK(v_int64x2, d) +OPENCV_HAL_IMPL_LSX_UNPACK(v_float32x4, w) +OPENCV_HAL_IMPL_LSX_UNPACK(v_float64x2, d) + +//ZIP +#define OPENCV_HAL_IMPL_LSX_ZIP(_Tpvec) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return (_Tpvec)__lsx_vilvl_d((__m128i)b.val, (__m128i)a.val); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return (_Tpvec)__lsx_vilvh_d((__m128i)b.val, (__m128i)a.val); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + __m128i a1 = (__m128i)a.val, b1 = (__m128i)b.val; \ + c = _Tpvec(__lsx_vilvl_d(b1, a1)); \ + d = _Tpvec(__lsx_vilvh_d(b1, a1)); \ + } \ + inline void v_zip(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& ab0, _Tpvec& ab1) \ + { \ + ab0 = v128_unpacklo(a, b); \ + ab1 = v128_unpackhi(a, b); \ + } + +OPENCV_HAL_IMPL_LSX_ZIP(v_uint8x16) +OPENCV_HAL_IMPL_LSX_ZIP(v_int8x16) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint16x8) +OPENCV_HAL_IMPL_LSX_ZIP(v_int16x8) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_int32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_uint64x2) +OPENCV_HAL_IMPL_LSX_ZIP(v_int64x2) +OPENCV_HAL_IMPL_LSX_ZIP(v_float32x4) +OPENCV_HAL_IMPL_LSX_ZIP(v_float64x2) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/** Arithmetics **/ +#define OPENCV_HAL_IMPL_LSX_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint8x16, __lsx_vsadd_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint8x16, __lsx_vssub_bu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int8x16, __lsx_vsadd_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int8x16, __lsx_vssub_b) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint16x8, __lsx_vsadd_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint16x8, __lsx_vssub_hu) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int16x8, __lsx_vsadd_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int16x8, __lsx_vssub_h) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_uint32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int32x4, __lsx_vadd_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int32x4, __lsx_vsub_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_int32x4, __lsx_vmul_w) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_uint64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_uint64x2, __lsx_vsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_int64x2, __lsx_vadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_int64x2, __lsx_vsub_d) + +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float32x4, __lsx_vfadd_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float32x4, __lsx_vfsub_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float32x4, __lsx_vfmul_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float32x4, __lsx_vfdiv_s) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_add, v_float64x2, __lsx_vfadd_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_sub, v_float64x2, __lsx_vfsub_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_mul, v_float64x2, __lsx_vfmul_d) +OPENCV_HAL_IMPL_LSX_BIN_OP(v_div, v_float64x2, __lsx_vfdiv_d) + +// saturating multiply 8-bit, 16-bit +inline v_uint8x16 v_mul(const v_uint8x16& a, const v_uint8x16& b) 
+{ + v_uint16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x16 v_mul(const v_int8x16& a, const v_int8x16& b) +{ + v_int16x8 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x8 v_mul(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i pev = __lsx_vmulwev_w_hu(a0, b0); + __m128i pod = __lsx_vmulwod_w_hu(a0, b0); + __m128i pl = __lsx_vilvl_w(pod, pev); + __m128i ph = __lsx_vilvh_w(pod, pev); + return (v_uint16x8)__lsx_vssrlrni_hu_w(ph, pl, 0); +} +inline v_int16x8 v_mul(const v_int16x8& a, const v_int16x8& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i pev = __lsx_vmulwev_w_h(a0, b0); + __m128i pod = __lsx_vmulwod_w_h(a0, b0); + __m128i pl = __lsx_vilvl_w(pod, pev); + __m128i ph = __lsx_vilvh_w(pod, pev); + return (v_int16x8)__lsx_vssrarni_h_w(ph, pl, 0); +} + +/** Non-saturating arithmetics **/ + +#define OPENCV_HAL_IMPL_LSX_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint8x16, __lsx_vadd_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int8x16, __lsx_vadd_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_uint16x8, __lsx_vadd_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_add_wrap, v_int16x8, __lsx_vadd_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint8x16, __lsx_vsub_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int8x16, __lsx_vsub_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_uint16x8, __lsx_vsub_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_sub_wrap, v_int16x8, __lsx_vsub_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_uint16x8, __lsx_vmul_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_mul_wrap, v_int16x8, __lsx_vmul_h) + +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_bu(a0, b0); + __m128i p1 = __lsx_vmulwod_h_bu(a0, b0); + return v_uint8x16(__lsx_vpackev_b(p1, p0)); +} + +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_bu(a0, b0); + __m128i p1 = __lsx_vmulwod_h_bu(a0, b0); + c.val = __lsx_vilvl_h(p1, p0); + d.val = __lsx_vilvh_h(p1, p0); +} +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_h_b(a0, b0); + __m128i p1 = __lsx_vmulwod_h_b(a0, b0); + c.val = __lsx_vilvl_h(p1, p0); + d.val = __lsx_vilvh_h(p1, p0); +} +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_w_h(a0, b0); + __m128i p1 = __lsx_vmulwod_w_h(a0, b0); + c.val = __lsx_vilvl_w(p1, p0); + d.val = __lsx_vilvh_w(p1, p0); +} +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_w_hu(a0, b0); + __m128i p1 = __lsx_vmulwod_w_hu(a0, b0); + c.val = __lsx_vilvl_w(p1, p0); + d.val = __lsx_vilvh_w(p1, p0); +} +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + __m128i a0 = a.val, b0 = b.val; + __m128i p0 = __lsx_vmulwev_d_wu(a0, b0); + __m128i p1 = 
__lsx_vmulwod_d_wu(a0, b0); + c.val = __lsx_vilvl_d(p1, p0); + d.val = __lsx_vilvh_d(p1, p0); +} +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ return v_int16x8(__lsx_vmuh_h(a.val, b.val)); } +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ return v_uint16x8(__lsx_vmuh_hu(a.val, b.val)); } + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_LSX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ + inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ + { return _Tpsvec(__lsx_vsll_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ + { return _Tpuvec(__lsx_vsrl_##suffix(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ + { return _Tpsvec(srai(a.val, __lsx_vreplgr2vr_##suffix(imm))); } \ + template<int imm> \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(__lsx_vslli_##suffix(a.val, imm)); } \ + template<int imm> \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(__lsx_vslli_##suffix(a.val, imm)); } \ + template<int imm> \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(__lsx_vsrli_##suffix(a.val, imm)); } \ + template<int imm> \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(__lsx_vsrai_##suffix(a.val, imm)); } \ + +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint16x8, v_int16x8, h, __lsx_vsra_h) +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint32x4, v_int32x4, w, __lsx_vsra_w) +OPENCV_HAL_IMPL_LSX_SHIFT_OP(v_uint64x2, v_int64x2, d, __lsx_vsra_d) + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_LSX_LOGIC_OP(_Tpvec, suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_and, _Tpvec, __lsx_vand_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_or, _Tpvec, __lsx_vor_##suffix) \ + OPENCV_HAL_IMPL_LSX_BIN_OP(v_xor, _Tpvec, __lsx_vxor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ + { return _Tpvec(__lsx_vnori_b(a.val, 0)); } \ + +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint8x16, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int8x16, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint16x8, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int16x8, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint32x4, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int32x4, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_uint64x2, v) +OPENCV_HAL_IMPL_LSX_LOGIC_OP(v_int64x2, v) + +#define OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(bin_op, _Tpvec, intrin, cast) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin((__m128i)(a.val), (__m128i)(b.val))); } + +#define OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(_Tpvec, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_and, _Tpvec, __lsx_vand_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_or, _Tpvec, __lsx_vor_v, cast) \ + OPENCV_HAL_IMPL_LSX_FLOAT_BIN_OP(v_xor, _Tpvec, __lsx_vxor_v, cast) \ + inline _Tpvec v_not(const _Tpvec& a) \ + { return _Tpvec(__lsx_vnori_b((__m128i)(a.val), 0)); } \ + +OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float32x4, _lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_FLOAT_LOGIC_OP(v_float64x2, _lsx_128_castsi128_pd) + +/** Select **/ +#define OPENCV_HAL_IMPL_LSX_SELECT(_Tpvec) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vbitsel_v(b.val, a.val, mask.val)); } \ + +OPENCV_HAL_IMPL_LSX_SELECT(v_uint8x16) +OPENCV_HAL_IMPL_LSX_SELECT(v_int8x16) +OPENCV_HAL_IMPL_LSX_SELECT(v_uint16x8) +OPENCV_HAL_IMPL_LSX_SELECT(v_int16x8) +OPENCV_HAL_IMPL_LSX_SELECT(v_uint32x4) +OPENCV_HAL_IMPL_LSX_SELECT(v_int32x4) + +inline v_float32x4 
v_select(const v_float32x4 &mask, const v_float32x4 &a, const v_float32x4 &b) +{ return v_float32x4(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); } +inline v_float64x2 v_select(const v_float64x2 &mask, const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vbitsel_v((__m128i)b.val, (__m128i)a.val, (__m128i)mask.val)); } + +/** Comparison **/ +#define OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpvec) \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } \ + inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ + { return v_gt(b, a); } \ + inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_lt(a, b)); } \ + inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ + { return v_ge(b, a); } \ + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, usuffix) \ + inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ + { return _Tpuvec(__lsx_vslt_##usuffix(b.val, a.val)); } \ + inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ + { return _Tpsvec(__lsx_vslt_##suffix(b.val, a.val)); } \ + OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpuvec) \ + OPENCV_HAL_IMPL_LSX_CMP_OP_OV(_Tpsvec) + +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint8x16, v_int8x16, b, bu) +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint16x8, v_int16x8, h, hu) +OPENCV_HAL_IMPL_LSX_CMP_OP_INT(v_uint32x4, v_int32x4, w, wu) + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(_Tpvec, suffix) \ + inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_vseq_##suffix(a.val, b.val)); } \ + inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ + { return v_not(v_eq(a, b)); } + +OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_uint64x2, d) +OPENCV_HAL_IMPL_LSX_CMP_OP_64BIT(v_int64x2, d) + +#define OPENCV_HAL_IMPL_LSX_CMP_FLT(bin_op, suffix, _Tpvec, ssuffix) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(__lsx_##suffix##_##ssuffix(a.val, b.val)); } \ + +#define OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(_Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_eq, vfcmp_ceq, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_ne, vfcmp_cne, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_lt, vfcmp_clt, _Tpvec, ssuffix) \ + OPENCV_HAL_IMPL_LSX_CMP_FLT(v_le, vfcmp_cle, _Tpvec, ssuffix) \ + +OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float32x4, s) +OPENCV_HAL_IMPL_LSX_CMP_OP_FLT(v_float64x2, d) + +inline v_float32x4 v_gt(const v_float32x4 &a, const v_float32x4 &b) +{ return v_float32x4(__lsx_vfcmp_clt_s(b.val, a.val)); } + +inline v_float32x4 v_ge(const v_float32x4 &a, const v_float32x4 &b) +{ return v_float32x4(__lsx_vfcmp_cle_s(b.val, a.val)); } + +inline v_float64x2 v_gt(const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vfcmp_clt_d(b.val, a.val)); } + +inline v_float64x2 v_ge(const v_float64x2 &a, const v_float64x2 &b) +{ return v_float64x2(__lsx_vfcmp_cle_d(b.val, a.val)); } + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return v_float32x4(__lsx_vfcmp_cor_s(a.val, a.val)); } + +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return v_float64x2(__lsx_vfcmp_cor_d(a.val, a.val)); } + +/** min/max **/ +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint8x16, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint8x16, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int8x16, __lsx_vmin_b) 
+OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int8x16, __lsx_vmax_b) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint16x8, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint16x8, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int16x8, __lsx_vmin_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int16x8, __lsx_vmax_h) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_uint32x4, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_uint32x4, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_int32x4, __lsx_vmin_w) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_int32x4, __lsx_vmax_w) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float32x4, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float32x4, __lsx_vfmax_s) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_min, v_float64x2, __lsx_vfmin_d) +OPENCV_HAL_IMPL_LSX_BIN_FUNC(v_max, v_float64x2, __lsx_vfmax_d) + +template<int imm, + bool is_invalid = ((imm < 0) || (imm > 16)), + bool is_first = (imm == 0), + bool is_half = (imm == 8), + bool is_second = (imm == 16), + bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))> +class v_lsx_palignr_u8_class; + +template<int imm> +class v_lsx_palignr_u8_class<imm, true, false, false, false, false>; + +template<int imm> +class v_lsx_palignr_u8_class<imm, false, true, false, false, false> +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + CV_UNUSED(b); + return a; + } +}; + +template<int imm> +class v_lsx_palignr_u8_class<imm, false, false, true, false, false> +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + return __lsx_vshuf4i_d(a, b, 0x9); + } +}; + +template<int imm> +class v_lsx_palignr_u8_class<imm, false, false, false, true, false> +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + CV_UNUSED(a); + return b; + } +}; + +template<int imm> +class v_lsx_palignr_u8_class<imm, false, false, false, false, true> +{ +public: + inline __m128i operator()(const __m128i& a, const __m128i& b) const + { + enum { imm2 = (sizeof(__m128i) - imm) }; + return __lsx_vor_v(__lsx_vbsrl_v(a, imm), __lsx_vbsll_v(b, imm2)); + } +}; + +template<int imm> +inline __m128i v_lsx_palignr_u8(const __m128i& a, const __m128i& b) +{ + CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_lsx_palignr_u8"); + return v_lsx_palignr_u8_class<imm>()(a, b); +} +/** Rotate **/ +#define OPENCV_HAL_IMPL_LSX_ROTATE_CAST(_Tpvec, cast) \ + template<int imm> \ + inline _Tpvec v_rotate_right(const _Tpvec &a) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + __m128i ret = __lsx_vbsrl_v((__m128i)a.val, imm2); \ + return _Tpvec(cast(ret)); \ + } \ + template<int imm> \ + inline _Tpvec v_rotate_left(const _Tpvec &a) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + __m128i ret = __lsx_vbsll_v((__m128i)a.val, imm2); \ + return _Tpvec(cast(ret)); \ + } \ + template<int imm> \ + inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type))}; \ + return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)a.val, (__m128i)b.val))); \ + } \ + template<int imm> \ + inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ + { \ + enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type))}; \ + return _Tpvec(cast(v_lsx_palignr_u8<imm2>((__m128i)b.val, (__m128i)a.val))); \ + } + +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint8x16, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int8x16, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint16x8, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int16x8, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint32x4, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int32x4, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_uint64x2, OPENCV_HAL_NOP) \ +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_int64x2, OPENCV_HAL_NOP) \ + 
+OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float32x4, _lsx_128_castsi128_ps) +OPENCV_HAL_IMPL_LSX_ROTATE_CAST(v_float64x2, _lsx_128_castsi128_pd) + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + __m128i vec = __lsx_vshuf4i_b(a.val, 0x1B); + return v_uint8x16(__lsx_vshuf4i_w(vec, 0x1B)); +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + __m128i vec = __lsx_vshuf4i_h(a.val, 0x1B); + return v_uint16x8(__lsx_vshuf4i_w(vec, 0x4E)); +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ return v_uint32x4(__lsx_vshuf4i_w(a.val, 0x1B)); } + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_int32x4(__lsx_vshuf4i_w(a.val, 0x1B)); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ return v_uint64x2(__lsx_vshuf4i_w(a.val, 0x4E)); } + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_int64x2(__lsx_vshuf4i_w(a.val, 0x4E)); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////////// Reduce and mask //////////// + +/** Reduce **/ +// returns a[0] + a[1] + ... + a[15] +inline unsigned v_reduce_sum(const v_uint8x16& a) +{ + __m128i t1 = __lsx_vhaddw_hu_bu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return (unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline int v_reduce_sum(const v_int8x16 &a) +{ + __m128i t1 = __lsx_vhaddw_h_b(a.val, a.val); + __m128i t2 = __lsx_vhaddw_w_h(t1, t1); + __m128i t3 = __lsx_vhaddw_d_w(t2, t2); + __m128i t4 = __lsx_vhaddw_q_d(t3, t3); + return (int)__lsx_vpickve2gr_w(t4, 0); +} + +#define OPENCV_HAL_IMPL_LSX_REDUCE_16(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + val = intrin(val, __lsx_vbsrl_v(val, 2)); \ + val = intrin(val, __lsx_vbsrl_v(val, 1)); \ + return (sctype)__lsx_vpickve2gr_b(val, 0); \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, min, __lsx_vmin_bu) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_uint8x16, uchar, max, __lsx_vmax_bu) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, min, __lsx_vmin_b) +OPENCV_HAL_IMPL_LSX_REDUCE_16(v_int8x16, schar, max, __lsx_vmax_b) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_8(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec &a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + val = intrin(val, __lsx_vbsrl_v(val, 2)); \ + return (sctype)__lsx_vpickve2gr_h(val, 0); \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, min, __lsx_vmin_hu) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_uint16x8, ushort, max, __lsx_vmax_hu) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, min, __lsx_vmin_h) +OPENCV_HAL_IMPL_LSX_REDUCE_8(v_int16x8, short, max, __lsx_vmax_h) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_4(_Tpvec, sctype, func, intrin) \ + inline sctype v_reduce_##func(const _Tpvec &a) \ + { \ + __m128i val = intrin(a.val, __lsx_vbsrl_v(a.val, 8)); \ + val = intrin(val, __lsx_vbsrl_v(val, 4)); \ + return 
(sctype)__lsx_vpickve2gr_w(val, 0); \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, min, __lsx_vmin_wu) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_uint32x4, unsigned, max, __lsx_vmax_wu) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, min, __lsx_vmin_w) +OPENCV_HAL_IMPL_LSX_REDUCE_4(v_int32x4, int, max, __lsx_vmax_w) + +#define OPENCV_HAL_IMPL_LSX_REDUCE_FLT(func, intrin) \ + inline float v_reduce_##func(const v_float32x4 &a) \ + { \ + __m128 val = a.val; \ + val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 8)); \ + val = intrin(val, (__m128)__lsx_vbsrl_v((__m128i)val, 4)); \ + float *fval = (float*)&val; \ + return fval[0]; \ + } + +OPENCV_HAL_IMPL_LSX_REDUCE_FLT(min, __lsx_vfmin_s) +OPENCV_HAL_IMPL_LSX_REDUCE_FLT(max, __lsx_vfmax_s) + +inline int v_reduce_sum(const v_int32x4 &a) +{ + __m128i t1 = __lsx_vhaddw_d_w(a.val, a.val); + __m128i t2 = __lsx_vhaddw_q_d(t1, t1); + return (int)__lsx_vpickve2gr_w(t2, 0); +} + +inline unsigned v_reduce_sum(const v_uint32x4 &a) +{ + __m128i t1 = __lsx_vhaddw_du_wu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (int)__lsx_vpickve2gr_w(t2, 0); +} + +inline int v_reduce_sum(const v_int16x8 &a) +{ + __m128i t1 = __lsx_vhaddw_w_h(a.val, a.val); + __m128i t2 = __lsx_vhaddw_d_w(t1, t1); + __m128i t3 = __lsx_vhaddw_q_d(t2, t2); + return (int)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sum(const v_uint16x8 &a) +{ + __m128i t1 = __lsx_vhaddw_wu_hu(a.val, a.val); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (int)__lsx_vpickve2gr_w(t3, 0); +} + +inline float v_reduce_sum(const v_float32x4 &a) +{ + __m128i val = (__m128i)a.val; + val = __lsx_vbsrl_v(val, 8); + __m128 result = __lsx_vfadd_s(a.val, (__m128)val); + float *pa = (float*)&result; + return (float)(pa[0] + pa[1]); +} + +inline uint64 v_reduce_sum(const v_uint64x2 &a) +{ + __m128i t0 = __lsx_vhaddw_qu_du(a.val, a.val); + return (uint64)__lsx_vpickve2gr_du(t0, 0); +} + +inline int64 v_reduce_sum(const v_int64x2 &a) +{ + __m128i t0 = __lsx_vhaddw_q_d(a.val, a.val); + return (int64)__lsx_vpickve2gr_d(t0, 0); +} + +inline double v_reduce_sum(const v_float64x2 &a) +{ + double *pa = (double*)&a; + return pa[0] + pa[1]; +} + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + __m128i a0 = (__m128i)a.val; + __m128i b0 = (__m128i)b.val; + __m128i c0 = (__m128i)c.val; + __m128i d0 = (__m128i)d.val; + __m128i ac_l = __lsx_vilvl_w(c0, a0); + __m128i ac_h = __lsx_vilvh_w(c0, a0); + __m128i bd_l = __lsx_vilvl_w(d0, b0); + __m128i bd_h = __lsx_vilvh_w(d0, b0); + __m128 ac = __lsx_vfadd_s((__m128)ac_l, (__m128)ac_h); + __m128 bd = __lsx_vfadd_s((__m128)bd_l, (__m128)bd_h); + return v_float32x4(__lsx_vfadd_s((__m128)__lsx_vilvl_w((__m128i)bd, (__m128i)ac), + (__m128)__lsx_vilvh_w((__m128i)bd, (__m128i)ac))); +} + +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + __m128i t0 = __lsx_vabsd_b(a.val, b.val); + __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return (unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i t0 = __lsx_vabsd_bu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_hu_bu(t0, t0); + __m128i t2 = __lsx_vhaddw_wu_hu(t1, t1); + __m128i t3 = __lsx_vhaddw_du_wu(t2, t2); + __m128i t4 = __lsx_vhaddw_qu_du(t3, t3); + return 
(unsigned)__lsx_vpickve2gr_w(t4, 0); +} + +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i t0 = __lsx_vabsd_hu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (unsigned)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + __m128i t0 = __lsx_vabsd_h(a.val, b.val); + __m128i t1 = __lsx_vhaddw_wu_hu(t0, t0); + __m128i t2 = __lsx_vhaddw_du_wu(t1, t1); + __m128i t3 = __lsx_vhaddw_qu_du(t2, t2); + return (unsigned)__lsx_vpickve2gr_w(t3, 0); +} + +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + __m128i t0 = __lsx_vabsd_wu(a.val, b.val); + __m128i t1 = __lsx_vhaddw_du_wu(t0, t0); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (unsigned)__lsx_vpickve2gr_w(t2, 0); +} + +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + __m128i t0 = __lsx_vabsd_w(a.val, b.val); + __m128i t1 = __lsx_vhaddw_du_wu(t0, t0); + __m128i t2 = __lsx_vhaddw_qu_du(t1, t1); + return (unsigned)__lsx_vpickve2gr_w(t2, 0); +} + +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 a_b = v_sub(a, b); + return v_reduce_sum(v_float32x4((__m128i)a_b.val & __lsx_vreplgr2vr_w(0x7fffffff))); +} + +/** Popcount **/ +#define OPENCV_HAL_IMPL_LSX_POPCOUNT(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_popcount(const _Tp& a) \ +{ return _Tpvec(__lsx_vpcnt_##suffix(a.val)); } + +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_uint8x16, b); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint8x16, v_int8x16, b); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_uint16x8, h); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint16x8, v_int16x8, h); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_uint32x4, w); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint32x4, v_int32x4, w); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_uint64x2, d); +OPENCV_HAL_IMPL_LSX_POPCOUNT(v_uint64x2, v_int64x2, d); + +/** Mask **/ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union {ft l; tt i;} v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +inline int v_signmask(const v_int8x16& a) +{ + __m128i result = __lsx_vmskltz_b(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_uint8x16& a) +{ return v_signmask(v_reinterpret_as_s8(a)) ;} + +inline int v_signmask(const v_int16x8 &a) +{ + __m128i result = __lsx_vmskltz_h(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_uint16x8 &a) +{ return v_signmask(v_reinterpret_as_s16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + __m128i result = __lsx_vmskltz_w(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +inline int v_signmask(const v_uint64x2& a) +{ + __m128i result = __lsx_vmskltz_d(a.val); + return __lsx_vpickve2gr_w(result, 0); +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } + +inline int v_signmask(const v_float32x4& a) +{ 
return v_signmask(*(v_int32x4*)(&a)); } + +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(*(v_int64x2*)(&a)); } + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +/** Checks **/ +#define OPENCV_HAL_IMPL_LSX_CHECK(_Tpvec, allmask) \ + inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \ + inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; } +OPENCV_HAL_IMPL_LSX_CHECK(v_uint8x16, 65535) +OPENCV_HAL_IMPL_LSX_CHECK(v_int8x16, 65535) +OPENCV_HAL_IMPL_LSX_CHECK(v_uint16x8, 255); +OPENCV_HAL_IMPL_LSX_CHECK(v_int16x8, 255); +OPENCV_HAL_IMPL_LSX_CHECK(v_uint32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_int32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_uint64x2, 3) +OPENCV_HAL_IMPL_LSX_CHECK(v_int64x2, 3) +OPENCV_HAL_IMPL_LSX_CHECK(v_float32x4, 15) +OPENCV_HAL_IMPL_LSX_CHECK(v_float64x2, 3) + +///////////// Other math ///////////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_LSX_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec &b, const _Tpvec& c) \ + { return _Tpvec(__lsx_vfmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(__lsx_vfsqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, v_mul(b, b)); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, v_mul(b, b))); } + +OPENCV_HAL_IMPL_LSX_MULADD(v_float32x4, s) +OPENCV_HAL_IMPL_LSX_MULADD(v_float64x2, d) + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return v_int32x4(__lsx_vmadd_w(c.val, a.val, b.val)); } + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ return v_fma(a, b, c); } + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + return v_float32x4(__lsx_vfrsqrt_s(x.val)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + return v_float64x2(__lsx_vfrsqrt_d(x.val)); +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_LSX_ABS(_Tpvec, suffix) \ + inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \ + { return v_u##_Tpvec(__lsx_vabsd_##suffix(x.val, __lsx_vldi(0))); } + +OPENCV_HAL_IMPL_LSX_ABS(int8x16, b) +OPENCV_HAL_IMPL_LSX_ABS(int16x8, h) +OPENCV_HAL_IMPL_LSX_ABS(int32x4, w) 
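+// Usage sketch (illustrative only; assumes float buffers of at least 4 elements):
+// the wrappers above let a fused multiply-add kernel be written without touching
+// the LSX intrinsics directly, e.g.
+//     v_float32x4 va = v_load(a), vb = v_load(b), vc = v_load(c);
+//     v_store(dst, v_fma(va, vb, vc));   // dst[i] = a[i] * b[i] + c[i]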
+ +inline v_float32x4 v_abs(const v_float32x4& x) +{ return v_float32x4(*((__m128i*)&x) & __lsx_vreplgr2vr_w(0x7fffffff)); } +inline v_float64x2 v_abs(const v_float64x2& x) +{ return v_float64x2(*((__m128i*)&x) & __lsx_vreplgr2vr_d(0x7fffffffffffffff)); } + +/** Absolute difference **/ + +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return (v_uint8x16)__lsx_vabsd_bu(a.val, b.val); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return (v_uint16x8)__lsx_vabsd_hu(a.val, b.val); } +inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return (v_uint32x4)__lsx_vabsd_wu(a.val, b.val); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ return (v_uint8x16)__lsx_vabsd_b(a.val, b.val); } +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ return (v_uint16x8)__lsx_vabsd_h(a.val, b.val); } +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ return (v_uint32x4)__lsx_vabsd_w(a.val, b.val); } + +inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) +{ return v_abs(v_sub(a, b)); } + +inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) +{ return v_abs(v_sub(a, b)); } + +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); +} +inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) +{ return v_sub(v_max(a, b), v_min(a, b)); } + +///////// Conversions ///////// + +/** Rounding **/ +inline v_int32x4 v_round(const v_float32x4& a) +{ return v_int32x4(__lsx_vftint_w_s(a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a) +{ return v_int32x4(__lsx_vftint_w_d(a.val, a.val)); } + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ return v_int32x4(__lsx_vftint_w_d(b.val, a.val)); } + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(a.val)); } + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ return v_int32x4(__lsx_vftintrz_w_d(a.val, a.val)); } + +inline v_int32x4 v_floor(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrm_s(a.val)))); } + +inline v_int32x4 v_floor(const v_float64x2& a) +{ return v_trunc(v_float64x2(__lsx_vfrintrm_d(a.val))); } + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ return v_int32x4(__lsx_vftintrz_w_s(__m128(__lsx_vfrintrp_s(a.val)))); } + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ return v_trunc(v_float64x2(__lsx_vfrintrp_d(a.val))); } + +/** To float **/ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ return v_float32x4(__lsx_vffint_s_w(a.val)); } + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ return v_float32x4(__lsx_vfcvt_s_d(a.val, a.val)); } + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ return v_float32x4(__lsx_vfcvt_s_d(b.val, a.val)); } + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ return v_float64x2(__lsx_vffintl_d_w(a.val)); } + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ return v_float64x2(__lsx_vffinth_d_w(a.val)); } + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ return v_float64x2(__lsx_vfcvtl_d_s(a.val)); } + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ return v_float64x2(__lsx_vfcvth_d_s(a.val)); } + +inline v_float64x2 v_cvt_f64(const v_int64x2& v) +{ return v_float64x2(__lsx_vffint_d_l(v.val)); } + + +//////////////// Lookup table 
access //////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + return v_int8x16(_v128_setr_b(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]], tab[idx[8]], + tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], + tab[idx[14]], tab[idx[15]])); +} + +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x16(_v128_setr_h(*(const short*)(tab + idx[0]), *(const short*)(tab + idx[1]), + *(const short*)(tab + idx[2]), *(const short*)(tab + idx[3]), *(const short*)(tab + idx[4]), + *(const short*)(tab + idx[5]), *(const short*)(tab + idx[6]), *(const short*)(tab + idx[7]))); +} + +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x16(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +} + +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) +{ return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_h(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]])); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_w(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), + *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3]))); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1]))); +} + +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) +{ return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + return v_int32x4(_v128_setr_w(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_int32x4 v_lut_pairs(const int *tab, const int* idx) +{ + return v_int32x4(_v128_setr_d(*(const int64_t*)(tab + idx[0]), *(const int64_t*)(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(__lsx_vld(tab + idx[0], 0)); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int *idx) +{ + return v_int64x2(_v128_setr_d(tab[idx[0]], tab[idx[1]])); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(__lsx_vld(tab + idx[0], 0)); +} + +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 
v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(_v128_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]])); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) +{ + return v_float32x4((__m128)_v128_setr_pd(*(const double*)(tab + idx[0]), *(const double*)(tab + idx[1]))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4((__m128)__lsx_vld(tab + idx[0], 0)); +} + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(_v128_setr_pd(tab[idx[0]], tab[idx[1]])); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2((__m128d)__lsx_vld(tab + idx[0], 0)); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int *idx = (int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + const int *idx = (const int*)&idxvec.val; + return v_lut(tab, idx); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + const int *idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + __m128i xy2 = __lsx_vld(tab + idx[2], 0); + __m128i xy3 = __lsx_vld(tab + idx[3], 0); + __m128i xy01 = __lsx_vilvl_d(xy1, xy0); + __m128i xy23 = __lsx_vilvl_d(xy3, xy2); + __m128i xxyy02 = __lsx_vilvl_w(xy23, xy01); + __m128i xxyy13 = __lsx_vilvh_w(xy23, xy01); + x = v_float32x4((__m128)__lsx_vilvl_w(xxyy13, xxyy02)); + y = v_float32x4((__m128)__lsx_vilvh_w(xxyy13, xxyy02)); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + const int* idx = (const int*)&idxvec.val; + __m128i xy0 = __lsx_vld(tab + idx[0], 0); + __m128i xy1 = __lsx_vld(tab + idx[1], 0); + x = v_float64x2((__m128d)__lsx_vilvl_d(xy1, xy0)); + y = v_float64x2((__m128d)__lsx_vilvh_d(xy1, xy0)); +} + +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + return v_int8x16(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0705060403010200, 0x0f0d0e0c0b090a08))); +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + return v_int8x16(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0703060205010400, 0x0f0b0e0a0d090c08))); +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + return v_int16x8(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0706030205040100, 0x0f0e0b0a0d0c0908))); +} +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) +{ return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + return v_int16x8(__lsx_vshuf_b(vec.val, vec.val, + _v128_setr_d(0x0b0a030209080100, 0x0f0e07060d0c0504))); +} +inline v_uint16x8 
v_interleave_quads(const v_uint16x8& vec) +{ return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + return v_int32x4(__lsx_vshuf4i_w(vec.val, 0xd8)); +} +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) +{ return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) +{ return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + __m128i zero = __lsx_vldi(0); + return v_int8x16(__lsx_vshuf_b(zero, vec.val, + _v128_set_d(0x1211100f0e0d0c0a, 0x0908060504020100))); +} +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + __m128i zero = __lsx_vldi(0); + return v_int16x8(__lsx_vshuf_b(zero, vec.val, + _v128_set_d(0x11100f0e0d0c0b0a, 0x0908050403020100))); +} +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) +{ return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + +//////////// Matrix operations ///////// + +/////////// Dot Product ///////// + +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + return v_int32x4(__lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y)); +} +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ + __m128i x = a.val, y = b.val, z = c.val; + __m128i t = __lsx_vmaddwev_w_h(z, x, y); + return v_int32x4(__lsx_vmaddwod_w_h(t, x, y)); +} + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + __m128i x = a.val, y = b.val; + return v_int64x2(__lsx_vmaddwod_d_w(__lsx_vmulwev_d_w(x, y), x, y)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + __m128i x = a.val, y = b.val, z = c.val; + __m128i t = __lsx_vmaddwev_d_w(z, x, y); + return v_int64x2(__lsx_vmaddwod_d_w(t, x, y)); +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_h_bu(x, y); + __m128i odd = __lsx_vmulwod_h_bu(x, y); + __m128i prod0 = __lsx_vhaddw_wu_hu(even, even); + __m128i prod1 = __lsx_vhaddw_wu_hu(odd, odd); + return v_uint32x4(__lsx_vadd_w(prod0, prod1)); +} + +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_add(v_dotprod_expand(a, b), c) ;} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_h_b(x, y); + __m128i odd = __lsx_vmulwod_h_b(x, y); + __m128i prod0 = __lsx_vhaddw_w_h(even, even); + __m128i prod1 = __lsx_vhaddw_w_h(odd, odd); + return v_int32x4(__lsx_vadd_w(prod0, prod1)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_hu(x, y); + __m128i odd = __lsx_vmulwod_w_hu(x, y); + 
__m128i prod0 = __lsx_vhaddw_du_wu(even, even); + __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd); + return v_uint64x2(__lsx_vadd_d(prod0, prod1)); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_h(x, y); + __m128i odd = __lsx_vmulwod_w_h(x, y); + __m128i prod0 = __lsx_vhaddw_d_w(even, even); + __m128i prod1 = __lsx_vhaddw_d_w(odd, odd); + return v_int64x2(__lsx_vadd_d(prod0, prod1)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_add(v_dotprod_expand(a, b), c); } + +//32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_add(v_dotprod_expand(a, b), c); } + + +///////// Fast Dot Product ////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i even = __lsx_vmulwev_w_hu(x, y); + __m128i odd = __lsx_vmulwod_w_hu(x, y); + __m128i prod0 = __lsx_vhaddw_du_wu(even, even); + __m128i prod1 = __lsx_vhaddw_du_wu(odd, odd); + return v_uint64x2(__lsx_vilvl_d(__lsx_vhaddw_qu_du(prod0, prod0), __lsx_vhaddw_qu_du(prod1, prod1))); +} +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_add(v_dotprod_expand_fast(a, b), c); } + +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ + __m128i x = a.val, y = b.val; + __m128i prod = __lsx_vmaddwod_w_h(__lsx_vmulwev_w_h(x, y), x, y); + __m128i sign = __lsx_vsrai_w(prod, 31); + __m128i lo = __lsx_vilvl_w(sign, prod); + __m128i hi = __lsx_vilvh_w(sign, prod); + return v_int64x2(__lsx_vadd_d(lo, hi)); +} +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_add(v_dotprod_expand_fast(a, b), c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const 
v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) +{ + __m128i x = (__m128i)v.val; + __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val); + __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val); + __m128 v2 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val); + __m128 v3 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0xFF), m3.val); + + return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), __lsx_vfadd_s(v2, v3))); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) +{ + __m128i x = (__m128i)v.val; + __m128 v0 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x0), m0.val); + __m128 v1 = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(x, 0x55), m1.val); + __m128 v2 = __lsx_vfmadd_s((__m128)__lsx_vshuf4i_w(x, 0xAA), m2.val, a.val); + + return v_float32x4(__lsx_vfadd_s(__lsx_vfadd_s(v0, v1), v2)); +} + +#define OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(_Tpvec, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m128i t0 = cast_from(__lsx_vilvl_w(a1.val, a0.val)); \ + __m128i t1 = cast_from(__lsx_vilvl_w(a3.val, a2.val)); \ + __m128i t2 = cast_from(__lsx_vilvh_w(a1.val, a0.val)); \ + __m128i t3 = cast_from(__lsx_vilvh_w(a3.val, a2.val)); \ + b0.val = cast_to(__lsx_vilvl_d(t1, t0)); \ + b1.val = cast_to(__lsx_vilvh_d(t1, t0)); \ + b2.val = cast_to(__lsx_vilvl_d(t3, t2)); \ + b3.val = cast_to(__lsx_vilvh_d(t3, t2)); \ + } + +OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_uint32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_LSX_TRANSPOSE4X4(v_int32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP) + +inline void v_transpose4x4(const v_float32x4& a0, const v_float32x4& a1, + const v_float32x4& a2, const v_float32x4& a3, + v_float32x4& b0, v_float32x4& b1, v_float32x4& b2, v_float32x4& b3) +{ + __m128i vec0 = (__m128i)a0.val, vec1 = (__m128i)a1.val; + __m128i vec2 = (__m128i)a2.val, vec3 = (__m128i)a3.val; + __m128i t0 = __lsx_vilvl_w(vec1, vec0); + __m128i t1 = __lsx_vilvl_w(vec3, vec2); + __m128i t2 = __lsx_vilvh_w(vec1, vec0); + __m128i t3 = __lsx_vilvh_w(vec3, vec2); + b0.val = __m128(__lsx_vilvl_d(t1, t0)); + b1.val = __m128(__lsx_vilvh_d(t1, t0)); + b2.val = __m128(__lsx_vilvl_d(t3, t2)); + b3.val = __m128(__lsx_vilvh_d(t3, t2)); +} + +////////////////// Value reordering //////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_LSX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin_lo, intrin_hi) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin_lo(a.val, 0); \ + b1.val = intrin_hi(a.val); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin_lo(a.val, 0)); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin_hi(a.val)); } \ + inline _Tpwvec v_load_expand(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + return _Tpwvec(intrin_lo(a, 0)); \ + } + +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint8x16, v_uint16x8, uchar, __lsx_vsllwil_hu_bu, __lsx_vexth_hu_bu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int8x16, v_int16x8, schar, __lsx_vsllwil_h_b, __lsx_vexth_h_b) +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint16x8, v_uint32x4, ushort, __lsx_vsllwil_wu_hu, __lsx_vexth_wu_hu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int16x8, v_int32x4, short, __lsx_vsllwil_w_h, __lsx_vexth_w_h) +OPENCV_HAL_IMPL_LSX_EXPAND(v_uint32x4, v_uint64x2, unsigned, __lsx_vsllwil_du_wu, __lsx_vexth_du_wu) +OPENCV_HAL_IMPL_LSX_EXPAND(v_int32x4, 
v_int64x2, int, __lsx_vsllwil_d_w, __lsx_vexth_d_w) + +#define OPENCV_HAL_IMPL_LSX_EXPAND_Q(_Tpvec, _Tp, intrin_lo, intrin_hi) \ + inline _Tpvec v_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = __lsx_vld(ptr, 0); \ + __m128i b = intrin_lo(a, 0); \ + return _Tpvec(intrin_hi(b, 0)); \ + } + +OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_uint32x4, uchar, __lsx_vsllwil_hu_bu, __lsx_vsllwil_wu_hu) +OPENCV_HAL_IMPL_LSX_EXPAND_Q(v_int32x4, schar, __lsx_vsllwil_h_b, __lsx_vsllwil_w_h) + +/* pack */ +// 16 +inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) +{ return v_int8x16(_lsx_packs_h(a.val, b.val)); } + +inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) +{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, 0)); } + +inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) +{ return v_uint8x16(_lsx_packus_h(a.val, b.val)); } + +inline void v_pack_store(schar* ptr, const v_int16x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(uchar* ptr, const v_uint16x8& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) +{ v_store_low(ptr, v_pack_u(a, a)); } + +template<int n> inline +v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b) +{ return v_uint8x16(__lsx_vssrlrni_bu_h(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) +{ __lsx_vstelm_d(__lsx_vssrlrni_bu_h(a.val, a.val, n), ptr, 0, 0); } + +template<int n> inline +v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) +{ return v_uint8x16(__lsx_vssrarni_bu_h(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) +{ __lsx_vstelm_d(__lsx_vssrarni_bu_h(a.val, a.val, n), ptr, 0, 0); } + +template<int n> inline +v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) +{ return v_int8x16(__lsx_vssrarni_b_h(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_store(schar* ptr, const v_int16x8& a) +{ __lsx_vstelm_d(__lsx_vssrarni_b_h(a.val, a.val, n), ptr, 0, 0); } + +//32 +inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) +{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, 0)); } + +inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) +{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, 0)); } + +inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) +{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, 0)); } + +inline void v_pack_store(short* ptr, const v_int32x4& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(ushort *ptr, const v_uint32x4& a) +{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, 0), ptr, 0, 0); } + +inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) +{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, 0), ptr, 0, 0); } + +template<int n> inline +v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) +{ return v_uint16x8(__lsx_vssrlrni_hu_w(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) +{ __lsx_vstelm_d(__lsx_vssrlrni_hu_w(a.val, a.val, n), ptr, 0, 0); } + +template<int n> inline +v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) +{ return v_uint16x8(__lsx_vssrarni_hu_w(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) +{ __lsx_vstelm_d(__lsx_vssrarni_hu_w(a.val, a.val, n), ptr, 0, 0); } + +template<int n> inline +v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) +{ return v_int16x8(__lsx_vssrarni_h_w(b.val, a.val, n)); } + +template<int n> inline +void 
v_rshr_pack_store(short* ptr, const v_int32x4& a) +{ __lsx_vstelm_d(__lsx_vssrarni_h_w(a.val, a.val, n), ptr, 0, 0); } + +// 64 +// Non-saturating pack +inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) +{ return v_uint32x4(__lsx_vpickev_w(b.val, a.val)); } + +inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) +{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); } + +inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) +{ __lsx_vstelm_d(__lsx_vshuf4i_w(a.val, 0x08), ptr, 0, 0); } + +inline void v_pack_store(int *ptr, const v_int64x2& a) +{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(a)); } + +template<int n> inline +v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) +{ return v_uint32x4(__lsx_vsrlrni_w_d(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) +{ __lsx_vstelm_d(__lsx_vsrlrni_w_d(a.val, a.val, n), ptr, 0, 0); } + +template<int n> inline +v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) +{ return v_int32x4(__lsx_vsrarni_w_d(b.val, a.val, n)); } + +template<int n> inline +void v_rshr_pack_store(int* ptr, const v_int64x2& a) +{ __lsx_vstelm_d(__lsx_vsrarni_w_d(a.val, a.val, n), ptr, 0, 0); } + +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ return v_uint8x16(__lsx_vssrarni_b_h(b.val, a.val, 0)); } + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + __m128i ab = __lsx_vssrarni_h_w(b.val, a.val, 0); + __m128i cd = __lsx_vssrarni_h_w(d.val, c.val, 0); + return v_uint8x16(__lsx_vssrarni_b_h(cd, ab, 0)); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + __m128i ab = __lsx_vssrarni_w_d(b.val, a.val, 0); + __m128i cd = __lsx_vssrarni_w_d(d.val, c.val, 0); + __m128i ef = __lsx_vssrarni_w_d(f.val, e.val, 0); + __m128i gh = __lsx_vssrarni_w_d(h.val, g.val, 0); + + __m128i abcd = __lsx_vssrarni_h_w(cd, ab, 0); + __m128i efgh = __lsx_vssrarni_h_w(gh, ef, 0); + return v_uint8x16(__lsx_vssrarni_b_h(efgh, abcd, 0)); +} + +/* Recombine */ +// implemented above, together with the load and store operations + +/* Extract */ +#define OPENCV_HAL_IMPL_LSX_EXTRACT(_Tpvec) \ + template<int s> \ + inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ + { return v_rotate_right<s>(a, b); } + +OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint8x16) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_int8x16) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint16x8) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_int16x8) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint32x4) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_int32x4) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_uint64x2) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_int64x2) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_float32x4) +OPENCV_HAL_IMPL_LSX_EXTRACT(v_float64x2) + +#define OPENCV_HAL_IMPL_LSX_EXTRACT_N(_Tpvec, _Twvec, intrin) \ +template<int i> \ +inline _Twvec v_extract_n(const _Tpvec& a) \ +{ return (_Twvec)intrin(a.val, i); } + +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint8x16, uchar, __lsx_vpickve2gr_b) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int8x16, schar, __lsx_vpickve2gr_b) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint16x8, ushort, __lsx_vpickve2gr_h) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int16x8, short, __lsx_vpickve2gr_h) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint32x4, uint, __lsx_vpickve2gr_w) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int32x4, int, __lsx_vpickve2gr_w) +OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_uint64x2, uint64, __lsx_vpickve2gr_d) 
+OPENCV_HAL_IMPL_LSX_EXTRACT_N(v_int64x2, int64, __lsx_vpickve2gr_d)
+
+template<int i>
+inline float v_extract_n(const v_float32x4& v)
+{
+    union { uint iv; float fv; } d;
+    d.iv = __lsx_vpickve2gr_w(v.val, i);
+    return d.fv;
+}
+
+template<int i>
+inline double v_extract_n(const v_float64x2& v)
+{
+    union { uint64 iv; double dv; } d;
+    d.iv = __lsx_vpickve2gr_d(v.val, i);
+    return d.dv;
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{ return v_uint32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{ return v_int32x4(__lsx_vreplvei_w(a.val, i)); }
+
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{ return v_float32x4((__m128)__lsx_vreplvei_w((__m128i)a.val, i)); }
+
+/////////////////// load deinterleave //////////////////////////////
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+
+    a.val = __lsx_vpickev_b(t1, t0);
+    b.val = __lsx_vpickod_b(t1, t0);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_h(t1, t0);
+    b.val = __lsx_vpickod_h(t1, t0);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vpickev_w(t1, t0);
+    b.val = __lsx_vpickod_w(t1, t0);
+}
+
+inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    a.val = __lsx_vilvl_d(t1, t0);
+    b.val = __lsx_vilvh_d(t1, t0);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_b(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    const __m128i shuff1 = _v128_setr_b(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i b0 = __lsx_vbitsel_v(t1, t0, shuff1);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+    const __m128i shuff_a = _v128_setr_b(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29);
+    const __m128i shuff_b = _v128_setr_b(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30);
+    const __m128i shuff_c = _v128_setr_b(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val = __lsx_vshuf_b(t2, b0, shuff_b);
+    c.val = __lsx_vshuf_b(t2, c0, shuff_c);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{
+    __m128i t0 = __lsx_vld(ptr, 0);
+    __m128i t1 = __lsx_vld(ptr, 16);
+    __m128i t2 = __lsx_vld(ptr, 32);
+    const __m128i shuff0 = _v128_setr_h(0, 0, -1, 0, 0, -1, 0, 0);
+    const __m128i shuff1 = _v128_setr_h(0, -1, 0, 0, -1, 0, 0, -1);
+
+    __m128i a0 = __lsx_vbitsel_v(t0, t1, shuff1);
+    __m128i b0 = __lsx_vbitsel_v(t0, t1, shuff0);
+    __m128i c0 = __lsx_vbitsel_v(t1, t0, shuff0);
+
+    const __m128i shuff_a = _v128_setr_b(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 20, 21, 26, 27);
+    const __m128i shuff_b = _v128_setr_b(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 16, 17, 22, 23, 28, 29);
+    const __m128i shuff_c = _v128_setr_b(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31);
+
+    a.val = __lsx_vshuf_b(t2, a0, shuff_a);
+    b.val =
__lsx_vshuf_b(t2, b0, shuff_b); + c.val = __lsx_vshuf_b(t2, c0, shuff_c); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) +{ + __m128i t0 = __lsx_vld(ptr, 0); + __m128i t1 = __lsx_vld(ptr, 16); + __m128i t2 = __lsx_vld(ptr, 32); + + __m128i a0 = __lsx_vpermi_w(t1, t0, 0xAC); + __m128i b0 = __lsx_vpermi_w(t1, t0, 0xC5); + __m128i c0 = __lsx_vpermi_w(t1, t0, 0x5A); + + a.val = __lsx_vextrins_w(a0, t2, 0x31); + b0 = __lsx_vshuf4i_w(b0, 0x38); + c0 = __lsx_vshuf4i_w(c0, 0x8); + b.val = __lsx_vextrins_w(b0, t2, 0x32); + c.val = __lsx_vpermi_w(t2, c0, 0xC4); +} + +inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) +{ + __m128i t0 = __lsx_vld(ptr, 0); + __m128i t1 = __lsx_vld(ptr, 16); + __m128i t2 = __lsx_vld(ptr, 32); + + a.val = __lsx_vshuf4i_d(t0, t1, 0xC); + b.val = __lsx_vshuf4i_d(t0, t2, 0x9); + c.val = __lsx_vshuf4i_d(t1, t2, 0xC); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + __m128i t0 = __lsx_vld(ptr, 0); + __m128i t1 = __lsx_vld(ptr, 16); + __m128i t2 = __lsx_vld(ptr, 32); + __m128i t3 = __lsx_vld(ptr, 48); + + __m128i ac_lo = __lsx_vpickev_b(t1, t0); + __m128i bd_lo = __lsx_vpickod_b(t1, t0); + __m128i ac_hi = __lsx_vpickev_b(t3, t2); + __m128i bd_hi = __lsx_vpickod_b(t3, t2); + + a.val = __lsx_vpickev_b(ac_hi, ac_lo); + c.val = __lsx_vpickod_b(ac_hi, ac_lo); + b.val = __lsx_vpickev_b(bd_hi, bd_lo); + d.val = __lsx_vpickod_b(bd_hi, bd_lo); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) +{ + __m128i t0 = __lsx_vld(ptr, 0); + __m128i t1 = __lsx_vld(ptr, 16); + __m128i t2 = __lsx_vld(ptr, 32); + __m128i t3 = __lsx_vld(ptr, 48); + + __m128i ac_lo = __lsx_vpickev_h(t1, t0); + __m128i bd_lo = __lsx_vpickod_h(t1, t0); + __m128i ac_hi = __lsx_vpickev_h(t3, t2); + __m128i bd_hi = __lsx_vpickod_h(t3, t2); + + a.val = __lsx_vpickev_h(ac_hi, ac_lo); + c.val = __lsx_vpickod_h(ac_hi, ac_lo); + b.val = __lsx_vpickev_h(bd_hi, bd_lo); + d.val = __lsx_vpickod_h(bd_hi, bd_lo); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) +{ + __m128i p0 = __lsx_vld(ptr, 0); + __m128i p1 = __lsx_vld(ptr, 16); + __m128i p2 = __lsx_vld(ptr, 32); + __m128i p3 = __lsx_vld(ptr, 48); + + __m128i t0 = __lsx_vilvl_w(p1, p0); + __m128i t1 = __lsx_vilvl_w(p3, p2); + __m128i t2 = __lsx_vilvh_w(p1, p0); + __m128i t3 = __lsx_vilvh_w(p3, p2); + a.val = __lsx_vilvl_d(t1, t0); + b.val = __lsx_vilvh_d(t1, t0); + c.val = __lsx_vilvl_d(t3, t2); + d.val = __lsx_vilvh_d(t3, t2); +} + +inline void v_load_deinterleave(const uint64* ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c, v_uint64x2& d) +{ + __m128i t0 = __lsx_vld(ptr, 0); + __m128i t1 = __lsx_vld(ptr, 16); + __m128i t2 = __lsx_vld(ptr, 32); + __m128i t3 = __lsx_vld(ptr, 48); + + a.val = __lsx_vilvl_d(t2, t0); + b.val = __lsx_vilvh_d(t2, t0); + c.val = __lsx_vilvl_d(t3, t1); + d.val = __lsx_vilvh_d(t3, t1); +} + +////////////////////////// store interleave //////////////////////////////// + +inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i v0 = __lsx_vilvl_b(b.val, a.val); + __m128i v1 = __lsx_vilvh_b(b.val, a.val); + + __lsx_vst(v0, ptr, 0); + __lsx_vst(v1, ptr, 16); +} + +inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + 
hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i v0 = __lsx_vilvl_h(b.val, a.val); + __m128i v1 = __lsx_vilvh_h(b.val, a.val); + + __lsx_vst(v0, ptr, 0); + __lsx_vst(v1, ptr, 16); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i v0 = __lsx_vilvl_w(b.val, a.val); + __m128i v1 = __lsx_vilvh_w(b.val, a.val); + + __lsx_vst(v0, ptr, 0); + __lsx_vst(v1, ptr, 16); +} + +inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i v0 = __lsx_vilvl_d(b.val, a.val); + __m128i v1 = __lsx_vilvh_d(b.val, a.val); + + __lsx_vst(v0, ptr, 0); + __lsx_vst(v1, ptr, 16); +} + +inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i ab_lo = __lsx_vilvl_b(b.val, a.val); + __m128i ab_hi = __lsx_vilvh_b(b.val, a.val); + __m128i v_c = c.val; + const __m128i shuff0 = _v128_setr_b(0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10); + const __m128i shuff1 = _v128_setr_b(11, 21, 12, 13, 22, 14, 15, 23, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 24, 18, 19, 25, 20, 21); + const __m128i shuff3 = _v128_setr_b(26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31); + __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4); + + __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0); + __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1); + __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3); + dst1 = __lsx_vshuf_b(abc, dst1, shuff2); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); +} + +inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i ab_lo = __lsx_vilvl_h(b.val, a.val); + __m128i ab_hi = __lsx_vilvh_h(b.val, a.val); + __m128i v_c = c.val; + const __m128i shuff0 = _v128_setr_b(0, 1, 2, 3, 16, 17, 4, 5, 6, 7, 18, 19, 8, 9, 10, 11); + const __m128i shuff1 = _v128_setr_b(20, 21, 12, 13, 14, 15, 22, 23, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i shuff2 = _v128_setr_b(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 24, 25, 20, 21); + const __m128i shuff3 = _v128_setr_b(6, 7, 26, 27, 8, 9, 10, 11, 28, 29, 12, 13, 14, 15, 30, 31); + __m128i abc = __lsx_vpermi_w(v_c, ab_hi, 0xE4); + + __m128i dst0 = __lsx_vshuf_b(v_c, ab_lo, shuff0); + __m128i dst1 = __lsx_vshuf_b(v_c, ab_lo, shuff1); + __m128i dst2 = __lsx_vshuf_b(v_c, ab_hi, shuff3); + dst1 = __lsx_vshuf_b(abc, dst1, shuff2); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i v_c = c.val; + __m128i ab_lo = __lsx_vilvl_w(b.val, a.val); //a0 b0 a1 b1 + __m128i ab_hi = __lsx_vilvh_w(b.val, a.val); //a2 b2 a3 b3 + __m128i bc_od = __lsx_vpackod_w(v_c, b.val); // b1 c1 b3 c3 + + __m128i dst0 = __lsx_vshuf4i_w(ab_lo, 0xB4); //a0 b0 b1 a1 + __m128i dst1 = __lsx_vilvl_d(ab_hi, bc_od); //b1 c1 a2 b2 + __m128i dst2 = __lsx_vpermi_w(bc_od, ab_hi, 0xE8); //a2, a3, b3, c3 + + dst0 = __lsx_vextrins_w(dst0, v_c, 0x20); + dst2 = __lsx_vextrins_w(dst2, v_c, 0x2); + __lsx_vst(dst0, ptr, 0); //a0 b0 c0 a1 + __lsx_vst(dst1, ptr, 16); //b1 c1 a2 b2 + __lsx_vst(dst2, ptr, 32); //c2 a3 b3 c3 +} + 
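For orientation, the register shuffles in the three-channel stores above implement an ordinary SoA-to-AoS write; the sketch below shows the memory layout they produce for four 32-bit lanes (the helper name and the plain loop are illustrative only, not part of the header):

// Scalar reference for the 3-channel 32-bit v_store_interleave above:
// the four lanes of a, b and c end up as packed (a, b, c) triplets,
// i.e. ptr[] = { a0,b0,c0, a1,b1,c1, a2,b2,c2, a3,b3,c3 }.
static void store_interleave_3ch_ref(unsigned* ptr, const unsigned a[4],
                                     const unsigned b[4], const unsigned c[4])
{
    for (int i = 0; i < 4; i++)
    {
        ptr[3 * i + 0] = a[i];
        ptr[3 * i + 1] = b[i];
        ptr[3 * i + 2] = c[i];
    }
}
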
+inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i dst0 = __lsx_vilvl_d(b.val, a.val); + __m128i dst1 = __lsx_vpermi_w(a.val, c.val, 0xE4); + __m128i dst2 = __lsx_vilvh_d(c.val, b.val); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); +} + +inline void v_store_interleave(uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, const v_uint8x16& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i ab_lo = __lsx_vilvl_b(b.val, a.val); + __m128i ab_hi = __lsx_vilvh_b(b.val, a.val); + __m128i cd_lo = __lsx_vilvl_b(d.val, c.val); + __m128i cd_hi = __lsx_vilvh_b(d.val, c.val); + + __m128i dst0 = __lsx_vilvl_h(cd_lo, ab_lo); + __m128i dst1 = __lsx_vilvh_h(cd_lo, ab_lo); + __m128i dst2 = __lsx_vilvl_h(cd_hi, ab_hi); + __m128i dst3 = __lsx_vilvh_h(cd_hi, ab_hi); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); + __lsx_vst(dst3, ptr, 48); +} + +inline void v_store_interleave(ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + const v_uint16x8& c, const v_uint16x8& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i ab_lo = __lsx_vilvl_h(b.val, a.val); + __m128i ab_hi = __lsx_vilvh_h(b.val, a.val); + __m128i cd_lo = __lsx_vilvl_h(d.val, c.val); + __m128i cd_hi = __lsx_vilvh_h(d.val, c.val); + + __m128i dst0 = __lsx_vilvl_w(cd_lo, ab_lo); + __m128i dst1 = __lsx_vilvh_w(cd_lo, ab_lo); + __m128i dst2 = __lsx_vilvl_w(cd_hi, ab_hi); + __m128i dst3 = __lsx_vilvh_w(cd_hi, ab_hi); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); + __lsx_vst(dst3, ptr, 48); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i ab_lo = __lsx_vilvl_w(b.val, a.val); + __m128i ab_hi = __lsx_vilvh_w(b.val, a.val); + __m128i cd_lo = __lsx_vilvl_w(d.val, c.val); + __m128i cd_hi = __lsx_vilvh_w(d.val, c.val); + + __m128i dst0 = __lsx_vilvl_d(cd_lo, ab_lo); + __m128i dst1 = __lsx_vilvh_d(cd_lo, ab_lo); + __m128i dst2 = __lsx_vilvl_d(cd_hi, ab_hi); + __m128i dst3 = __lsx_vilvh_d(cd_hi, ab_hi); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); + __lsx_vst(dst3, ptr, 48); +} + +inline void v_store_interleave(uint64* ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, const v_uint64x2& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + __m128i dst0 = __lsx_vilvl_d(b.val, a.val); + __m128i dst2 = __lsx_vilvh_d(b.val, a.val); + __m128i dst1 = __lsx_vilvl_d(d.val, c.val); + __m128i dst3 = __lsx_vilvh_d(d.val, c.val); + + __lsx_vst(dst0, ptr, 0); + __lsx_vst(dst1, ptr, 16); + __lsx_vst(dst2, ptr, 32); + __lsx_vst(dst3, ptr, 48); +} + +#define OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ 
+inline void v_load_deinterleave(const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, \ + _Tpvec0& c0, _Tpvec0& d0) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1); \ +} \ +inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0,\ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ +} \ +inline void v_store_interleave(_Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ +} + +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_LSX_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) + +// +// FP16 +// + +inline v_float32x4 v_load_expand(const hfloat* ptr) +{ +#if CV_FP16 + return v_float32x4(__lsx_vfcvtl_s_h((__m128)__lsx_vld(ptr, 0))); +#else + float CV_DECL_ALIGNED(32) buf[4]; + for (int i = 0; i < 4; i++) + buf[i] = (float)ptr[i]; + return v_float32x4((__m128)__lsx_vld(buf, 0)); +#endif +} + +inline void v_pack_store(hfloat* ptr, const v_float32x4& a) +{ +#if CV_FP16 + __m128i res = (__m218i)__lsx_vfcvt_h_s(a.val, a.val); + __lsx_vstelm_d(res, ptr, 0, 0); +#else + float CV_DECL_ALIGNED(32) buf[4]; + v_store_aligned(buf, a); + for (int i = 0; i < 4; i++) + ptr[i] = hfloat(buf[i]); +#endif +} + +// +// end of FP16 +// + +inline void v_cleanup() {} + +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return 
v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_LSX_HPP diff --git a/3rdParty/opencv2/core/hal/intrin_math.hpp b/3rdParty/opencv2/core/hal/intrin_math.hpp new file mode 100644 index 0000000000..b7e649e744 --- /dev/null +++ b/3rdParty/opencv2/core/hal/intrin_math.hpp @@ -0,0 +1,687 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + + +/* Universal Intrinsics implementation of sin, cos, exp and log + + Inspired by Intel Approximate Math library, and based on the + corresponding algorithms of the cephes math library +*/ + +/* Copyright (C) 2010,2011 RJVB - extensions */ +/* Copyright (C) 2011 Julien Pommier + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ +#ifndef OPENCV_HAL_INTRIN_MATH_HPP +#define OPENCV_HAL_INTRIN_MATH_HPP + +//! @name Exponential +//! @{ +// Implementation is the same as float32 vector. 
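Before the vector implementations, a scalar outline of the Cephes-style scheme they follow may help: clamp the argument, split it into n times ln 2 plus a small remainder r, evaluate a short polynomial for exp(r), and rebuild 2^n directly in the IEEE-754 exponent field. The standalone sketch below reuses the float32 constants that appear in the code further down; it is an illustration of the algorithm only, not part of the header.

#include <algorithm>
#include <cmath>
#include <cstring>

// Scalar outline of the vectorised exp below (float32 constants):
static float exp_scalar_ref(float x)
{
    x = std::min(std::max(x, -88.3762626647949f), 89.f);   // clamp like the vector code
    float n = std::floor(x * 1.44269504088896341f + 0.5f); // n = round(x * log2(e))
    float r = n * (-6.93359375E-1f) + x;                   // r = x - n*ln2 (high part)
    r = n * (2.12194440E-4f) + r;                          // low-part correction of ln2
    float r2 = r * r;
    float p = 1.9875691500E-4f;                            // degree-5 polynomial for exp(r)
    p = p * r + 1.3981999507E-3f;
    p = p * r + 8.3334519073E-3f;
    p = p * r + 4.1665795894E-2f;
    p = p * r + 1.6666665459E-1f;
    p = p * r + 5.0000001201E-1f;
    float y = p * r2 + r + 1.f;
    int bits = ((int)n + 127) << 23;                       // assemble 2^n in the exponent field
    float scale;
    std::memcpy(&scale, &bits, sizeof(scale));
    return y * scale;                                      // the vector code additionally propagates NaN inputs
}
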
+template +inline _TpVec16F v_exp_default_16f(const _TpVec16F &x) { + const _TpVec16F _vexp_lo_f16 = v_setall_<_TpVec16F>(-10.7421875f); + const _TpVec16F _vexp_hi_f16 = v_setall_<_TpVec16F>(11.f); + const _TpVec16F _vexp_half_fp16 = v_setall_<_TpVec16F>(0.5f); + const _TpVec16F _vexp_one_fp16 = v_setall_<_TpVec16F>(1.f); + const _TpVec16F _vexp_LOG2EF_f16 = v_setall_<_TpVec16F>(1.44269504088896341f); + const _TpVec16F _vexp_C1_f16 = v_setall_<_TpVec16F>(-6.93359375E-1f); + const _TpVec16F _vexp_C2_f16 = v_setall_<_TpVec16F>(2.12194440E-4f); + const _TpVec16F _vexp_p0_f16 = v_setall_<_TpVec16F>(1.9875691500E-4f); + const _TpVec16F _vexp_p1_f16 = v_setall_<_TpVec16F>(1.3981999507E-3f); + const _TpVec16F _vexp_p2_f16 = v_setall_<_TpVec16F>(8.3334519073E-3f); + const _TpVec16F _vexp_p3_f16 = v_setall_<_TpVec16F>(4.1665795894E-2f); + const _TpVec16F _vexp_p4_f16 = v_setall_<_TpVec16F>(1.6666665459E-1f); + const _TpVec16F _vexp_p5_f16 = v_setall_<_TpVec16F>(5.0000001201E-1f); + + _TpVec16F _vexp_, _vexp_x, _vexp_y, _vexp_xx; + _TpVec16S _vexp_mm; + const _TpVec16S _vexp_bias_s16 = v_setall_<_TpVec16S>((short)0xf); + + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f16); + _vexp_x = v_min(_vexp_x, _vexp_hi_f16); + + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f16, _vexp_half_fp16); + _vexp_mm = v_floor(_vexp_); + _vexp_ = v_cvt_f16(_vexp_mm); + _vexp_mm = v_add(_vexp_mm, _vexp_bias_s16); + _vexp_mm = v_shl(_vexp_mm, 10); + + _vexp_x = v_fma(_vexp_, _vexp_C1_f16, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f16, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); + + _vexp_y = v_fma(_vexp_x, _vexp_p0_f16, _vexp_p1_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f16); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f16); + + _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); + _vexp_y = v_add(_vexp_y, _vexp_one_fp16); + _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f16(_vexp_mm)); + + // exp(NAN) -> NAN + _TpVec16F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); +} + +template +inline _TpVec32F v_exp_default_32f(const _TpVec32F &x) { + const _TpVec32F _vexp_lo_f32 = v_setall_<_TpVec32F>(-88.3762626647949f); + const _TpVec32F _vexp_hi_f32 = v_setall_<_TpVec32F>(89.f); + const _TpVec32F _vexp_half_fp32 = v_setall_<_TpVec32F>(0.5f); + const _TpVec32F _vexp_one_fp32 = v_setall_<_TpVec32F>(1.f); + const _TpVec32F _vexp_LOG2EF_f32 = v_setall_<_TpVec32F>(1.44269504088896341f); + const _TpVec32F _vexp_C1_f32 = v_setall_<_TpVec32F>(-6.93359375E-1f); + const _TpVec32F _vexp_C2_f32 = v_setall_<_TpVec32F>(2.12194440E-4f); + const _TpVec32F _vexp_p0_f32 = v_setall_<_TpVec32F>(1.9875691500E-4f); + const _TpVec32F _vexp_p1_f32 = v_setall_<_TpVec32F>(1.3981999507E-3f); + const _TpVec32F _vexp_p2_f32 = v_setall_<_TpVec32F>(8.3334519073E-3f); + const _TpVec32F _vexp_p3_f32 = v_setall_<_TpVec32F>(4.1665795894E-2f); + const _TpVec32F _vexp_p4_f32 = v_setall_<_TpVec32F>(1.6666665459E-1f); + const _TpVec32F _vexp_p5_f32 = v_setall_<_TpVec32F>(5.0000001201E-1f); + + _TpVec32F _vexp_, _vexp_x, _vexp_y, _vexp_xx; + _TpVec32S _vexp_mm; + const _TpVec32S _vexp_bias_s32 = v_setall_<_TpVec32S>((int)0x7f); + + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f32); + _vexp_x = v_min(_vexp_x, _vexp_hi_f32); + + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f32, _vexp_half_fp32); + _vexp_mm = v_floor(_vexp_); + _vexp_ = v_cvt_f32(_vexp_mm); + _vexp_mm = 
v_add(_vexp_mm, _vexp_bias_s32); + _vexp_mm = v_shl(_vexp_mm, 23); + + _vexp_x = v_fma(_vexp_, _vexp_C1_f32, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f32, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); + + _vexp_y = v_fma(_vexp_x, _vexp_p0_f32, _vexp_p1_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p2_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p3_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p4_f32); + _vexp_y = v_fma(_vexp_y, _vexp_x, _vexp_p5_f32); + + _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_x); + _vexp_y = v_add(_vexp_y, _vexp_one_fp32); + _vexp_y = v_mul(_vexp_y, v_reinterpret_as_f32(_vexp_mm)); + + // exp(NAN) -> NAN + _TpVec32F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_y, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); +} + +template +inline _TpVec64F v_exp_default_64f(const _TpVec64F &x) { + const _TpVec64F _vexp_lo_f64 = v_setall_<_TpVec64F>(-709.43613930310391424428); + const _TpVec64F _vexp_hi_f64 = v_setall_<_TpVec64F>(710.); + const _TpVec64F _vexp_half_f64 = v_setall_<_TpVec64F>(0.5); + const _TpVec64F _vexp_one_f64 = v_setall_<_TpVec64F>(1.0); + const _TpVec64F _vexp_two_f64 = v_setall_<_TpVec64F>(2.0); + const _TpVec64F _vexp_LOG2EF_f64 = v_setall_<_TpVec64F>(1.44269504088896340736); + const _TpVec64F _vexp_C1_f64 = v_setall_<_TpVec64F>(-6.93145751953125E-1); + const _TpVec64F _vexp_C2_f64 = v_setall_<_TpVec64F>(-1.42860682030941723212E-6); + const _TpVec64F _vexp_p0_f64 = v_setall_<_TpVec64F>(1.26177193074810590878E-4); + const _TpVec64F _vexp_p1_f64 = v_setall_<_TpVec64F>(3.02994407707441961300E-2); + const _TpVec64F _vexp_p2_f64 = v_setall_<_TpVec64F>(9.99999999999999999910E-1); + const _TpVec64F _vexp_q0_f64 = v_setall_<_TpVec64F>(3.00198505138664455042E-6); + const _TpVec64F _vexp_q1_f64 = v_setall_<_TpVec64F>(2.52448340349684104192E-3); + const _TpVec64F _vexp_q2_f64 = v_setall_<_TpVec64F>(2.27265548208155028766E-1); + const _TpVec64F _vexp_q3_f64 = v_setall_<_TpVec64F>(2.00000000000000000009E0); + + _TpVec64F _vexp_, _vexp_x, _vexp_y, _vexp_z, _vexp_xx; + _TpVec64S _vexp_mm; + const _TpVec64S _vexp_bias_s64 = v_setall_<_TpVec64S>((int64)0x3ff); + + // compute exponential of x + _vexp_x = v_max(x, _vexp_lo_f64); + _vexp_x = v_min(_vexp_x, _vexp_hi_f64); + + _vexp_ = v_fma(_vexp_x, _vexp_LOG2EF_f64, _vexp_half_f64); + _vexp_mm = v_expand_low(v_floor(_vexp_)); + _vexp_ = v_cvt_f64(_vexp_mm); + _vexp_mm = v_add(_vexp_mm, _vexp_bias_s64); + _vexp_mm = v_shl(_vexp_mm, 52); + + _vexp_x = v_fma(_vexp_, _vexp_C1_f64, _vexp_x); + _vexp_x = v_fma(_vexp_, _vexp_C2_f64, _vexp_x); + _vexp_xx = v_mul(_vexp_x, _vexp_x); + + _vexp_y = v_fma(_vexp_xx, _vexp_p0_f64, _vexp_p1_f64); + _vexp_y = v_fma(_vexp_y, _vexp_xx, _vexp_p2_f64); + _vexp_y = v_mul(_vexp_y, _vexp_x); + + _vexp_z = v_fma(_vexp_xx, _vexp_q0_f64, _vexp_q1_f64); + _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q2_f64); + _vexp_z = v_fma(_vexp_xx, _vexp_z, _vexp_q3_f64); + + _vexp_z = v_div(_vexp_y, v_sub(_vexp_z, _vexp_y)); + _vexp_z = v_fma(_vexp_two_f64, _vexp_z, _vexp_one_f64); + _vexp_z = v_mul(_vexp_z, v_reinterpret_as_f64(_vexp_mm)); + + // exp(NAN) -> NAN + _TpVec64F mask_not_nan = v_not_nan(x); + return v_select(mask_not_nan, _vexp_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7FF8000000000000))); +} +//! @} + +//! @name Natural Logarithm +//! 
@{ +template +inline _TpVec16F v_log_default_16f(const _TpVec16F &x) { + const _TpVec16F _vlog_one_fp16 = v_setall_<_TpVec16F>(1.0f); + const _TpVec16F _vlog_SQRTHF_fp16 = v_setall_<_TpVec16F>(0.707106781186547524f); + const _TpVec16F _vlog_q1_fp16 = v_setall_<_TpVec16F>(-2.12194440E-4f); + const _TpVec16F _vlog_q2_fp16 = v_setall_<_TpVec16F>(0.693359375f); + const _TpVec16F _vlog_p0_fp16 = v_setall_<_TpVec16F>(7.0376836292E-2f); + const _TpVec16F _vlog_p1_fp16 = v_setall_<_TpVec16F>(-1.1514610310E-1f); + const _TpVec16F _vlog_p2_fp16 = v_setall_<_TpVec16F>(1.1676998740E-1f); + const _TpVec16F _vlog_p3_fp16 = v_setall_<_TpVec16F>(-1.2420140846E-1f); + const _TpVec16F _vlog_p4_fp16 = v_setall_<_TpVec16F>(1.4249322787E-1f); + const _TpVec16F _vlog_p5_fp16 = v_setall_<_TpVec16F>(-1.6668057665E-1f); + const _TpVec16F _vlog_p6_fp16 = v_setall_<_TpVec16F>(2.0000714765E-1f); + const _TpVec16F _vlog_p7_fp16 = v_setall_<_TpVec16F>(-2.4999993993E-1f); + const _TpVec16F _vlog_p8_fp16 = v_setall_<_TpVec16F>(3.3333331174E-1f); + + _TpVec16F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + _TpVec16S _vlog_ux, _vlog_emm0; + const _TpVec16S _vlog_inv_mant_mask_s16 = v_setall_<_TpVec16S>((short)~0x7c00); + + _vlog_ux = v_reinterpret_as_s16(x); + _vlog_emm0 = v_shr(_vlog_ux, 10); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s16); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s16(v_setall_<_TpVec16F>(0.5f))); + _vlog_x = v_reinterpret_as_f16(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec16S>((short)0xf)); + _vlog_e = v_cvt_f16(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp16); + + _TpVec16F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp16); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp16); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp16, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_z = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp16, _vlog_x, _vlog_p1_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp16); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp16); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); + + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp16, _vlog_y); + + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec16F>(0.5f))); + + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp16, _vlog_x); + // log(0) -> -INF + _TpVec16F mask_zero = v_eq(x, v_setzero_<_TpVec16F>()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0xfc00)), _vlog_x); + // log(NEG), log(NAN) -> NAN + _TpVec16F mask_not_nan = v_ge(x, v_setzero_<_TpVec16F>()); + _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00))); + // log(INF) -> INF + _TpVec16F mask_inf = v_eq(x, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; +} + +template +inline _TpVec32F v_log_default_32f(const _TpVec32F &x) { + const _TpVec32F _vlog_one_fp32 = v_setall_<_TpVec32F>(1.0f); + const _TpVec32F _vlog_SQRTHF_fp32 = v_setall_<_TpVec32F>(0.707106781186547524f); + const _TpVec32F _vlog_q1_fp32 = v_setall_<_TpVec32F>(-2.12194440E-4f); + const _TpVec32F _vlog_q2_fp32 = v_setall_<_TpVec32F>(0.693359375f); + const _TpVec32F 
_vlog_p0_fp32 = v_setall_<_TpVec32F>(7.0376836292E-2f); + const _TpVec32F _vlog_p1_fp32 = v_setall_<_TpVec32F>(-1.1514610310E-1f); + const _TpVec32F _vlog_p2_fp32 = v_setall_<_TpVec32F>(1.1676998740E-1f); + const _TpVec32F _vlog_p3_fp32 = v_setall_<_TpVec32F>(-1.2420140846E-1f); + const _TpVec32F _vlog_p4_fp32 = v_setall_<_TpVec32F>(1.4249322787E-1f); + const _TpVec32F _vlog_p5_fp32 = v_setall_<_TpVec32F>(-1.6668057665E-1f); + const _TpVec32F _vlog_p6_fp32 = v_setall_<_TpVec32F>(2.0000714765E-1f); + const _TpVec32F _vlog_p7_fp32 = v_setall_<_TpVec32F>(-2.4999993993E-1f); + const _TpVec32F _vlog_p8_fp32 = v_setall_<_TpVec32F>(3.3333331174E-1f); + + _TpVec32F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp; + _TpVec32S _vlog_ux, _vlog_emm0; + const _TpVec32S _vlog_inv_mant_mask_s32 = v_setall_<_TpVec32S>((int)~0x7f800000); + + _vlog_ux = v_reinterpret_as_s32(x); + _vlog_emm0 = v_shr(_vlog_ux, 23); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s32); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s32(v_setall_<_TpVec32F>(0.5f))); + _vlog_x = v_reinterpret_as_f32(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec32S>((int)0x7f)); + _vlog_e = v_cvt_f32(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp32); + + _TpVec32F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp32); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp32); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp32, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_z = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp32, _vlog_x, _vlog_p1_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p6_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p7_fp32); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p8_fp32); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_z); + + _vlog_y = v_fma(_vlog_e, _vlog_q1_fp32, _vlog_y); + + _vlog_y = v_sub(_vlog_y, v_mul(_vlog_z, v_setall_<_TpVec32F>(0.5f))); + + _vlog_x = v_add(_vlog_x, _vlog_y); + _vlog_x = v_fma(_vlog_e, _vlog_q2_fp32, _vlog_x); + // log(0) -> -INF + _TpVec32F mask_zero = v_eq(x, v_setzero_<_TpVec32F>()); + _vlog_x = v_select(mask_zero, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0xff800000)), _vlog_x); + // log(NEG), log(NAN) -> NAN + _TpVec32F mask_not_nan = v_ge(x, v_setzero_<_TpVec32F>()); + _vlog_x = v_select(mask_not_nan, _vlog_x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000))); + // log(INF) -> INF + _TpVec32F mask_inf = v_eq(x, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); + _vlog_x = v_select(mask_inf, x, _vlog_x); + return _vlog_x; +} + +template +inline _TpVec64F v_log_default_64f(const _TpVec64F &x) { + const _TpVec64F _vlog_one_fp64 = v_setall_<_TpVec64F>(1.0); + const _TpVec64F _vlog_SQRTHF_fp64 = v_setall_<_TpVec64F>(0.7071067811865475244); + const _TpVec64F _vlog_p0_fp64 = v_setall_<_TpVec64F>(1.01875663804580931796E-4); + const _TpVec64F _vlog_p1_fp64 = v_setall_<_TpVec64F>(4.97494994976747001425E-1); + const _TpVec64F _vlog_p2_fp64 = v_setall_<_TpVec64F>(4.70579119878881725854); + const _TpVec64F _vlog_p3_fp64 = v_setall_<_TpVec64F>(1.44989225341610930846E1); + const _TpVec64F _vlog_p4_fp64 = v_setall_<_TpVec64F>(1.79368678507819816313E1); + const _TpVec64F _vlog_p5_fp64 = v_setall_<_TpVec64F>(7.70838733755885391666); + const _TpVec64F _vlog_q0_fp64 = 
v_setall_<_TpVec64F>(1.12873587189167450590E1); + const _TpVec64F _vlog_q1_fp64 = v_setall_<_TpVec64F>(4.52279145837532221105E1); + const _TpVec64F _vlog_q2_fp64 = v_setall_<_TpVec64F>(8.29875266912776603211E1); + const _TpVec64F _vlog_q3_fp64 = v_setall_<_TpVec64F>(7.11544750618563894466E1); + const _TpVec64F _vlog_q4_fp64 = v_setall_<_TpVec64F>(2.31251620126765340583E1); + + const _TpVec64F _vlog_C0_fp64 = v_setall_<_TpVec64F>(2.121944400546905827679e-4); + const _TpVec64F _vlog_C1_fp64 = v_setall_<_TpVec64F>(0.693359375); + + _TpVec64F _vlog_x, _vlog_e, _vlog_y, _vlog_z, _vlog_tmp, _vlog_xx; + _TpVec64S _vlog_ux, _vlog_emm0; + const _TpVec64S _vlog_inv_mant_mask_s64 = v_setall_<_TpVec64S>((int64)~0x7ff0000000000000); + + _vlog_ux = v_reinterpret_as_s64(x); + _vlog_emm0 = v_shr(_vlog_ux, 52); + + _vlog_ux = v_and(_vlog_ux, _vlog_inv_mant_mask_s64); + _vlog_ux = v_or(_vlog_ux, v_reinterpret_as_s64(v_setall_<_TpVec64F>(0.5))); + _vlog_x = v_reinterpret_as_f64(_vlog_ux); + + _vlog_emm0 = v_sub(_vlog_emm0, v_setall_<_TpVec64S>((int64)0x3ff)); + _vlog_e = v_cvt_f64(_vlog_emm0); + + _vlog_e = v_add(_vlog_e, _vlog_one_fp64); + + _TpVec64F _vlog_mask = v_lt(_vlog_x, _vlog_SQRTHF_fp64); + _vlog_tmp = v_and(_vlog_x, _vlog_mask); + _vlog_x = v_sub(_vlog_x, _vlog_one_fp64); + _vlog_e = v_sub(_vlog_e, v_and(_vlog_one_fp64, _vlog_mask)); + _vlog_x = v_add(_vlog_x, _vlog_tmp); + + _vlog_xx = v_mul(_vlog_x, _vlog_x); + + _vlog_y = v_fma(_vlog_p0_fp64, _vlog_x, _vlog_p1_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p2_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p3_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p4_fp64); + _vlog_y = v_fma(_vlog_y, _vlog_x, _vlog_p5_fp64); + _vlog_y = v_mul(_vlog_y, _vlog_x); + _vlog_y = v_mul(_vlog_y, _vlog_xx); + + _vlog_z = v_add(_vlog_x, _vlog_q0_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q1_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q2_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q3_fp64); + _vlog_z = v_fma(_vlog_z, _vlog_x, _vlog_q4_fp64); + + _vlog_z = v_div(_vlog_y, _vlog_z); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_e, _vlog_C0_fp64)); + _vlog_z = v_sub(_vlog_z, v_mul(_vlog_xx, v_setall_<_TpVec64F>(0.5))); + + _vlog_z = v_add(_vlog_z, _vlog_x); + _vlog_z = v_fma(_vlog_e, _vlog_C1_fp64, _vlog_z); + + // log(0) -> -INF + _TpVec64F mask_zero = v_eq(x, v_setzero_<_TpVec64F>()); + _vlog_z = v_select(mask_zero, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0xfff0000000000000)), _vlog_z); + // log(NEG), log(NAN) -> NAN + _TpVec64F mask_not_nan = v_ge(x, v_setzero_<_TpVec64F>()); + _vlog_z = v_select(mask_not_nan, _vlog_z, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000))); + // log(INF) -> INF + _TpVec64F mask_inf = v_eq(x, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); + _vlog_z = v_select(mask_inf, x, _vlog_z); + return _vlog_z; +} +//! @} + +//! @name Sine and Cosine +//! 
@{ +template +inline void v_sincos_default_16f(const _TpVec16F &x, _TpVec16F &ysin, _TpVec16F &ycos) { + const _TpVec16F v_cephes_FOPI = v_setall_<_TpVec16F>(hfloat(1.27323954473516f)); // 4 / M_PI + const _TpVec16F v_minus_DP1 = v_setall_<_TpVec16F>(hfloat(-0.78515625f)); + const _TpVec16F v_minus_DP2 = v_setall_<_TpVec16F>(hfloat(-2.4187564849853515625E-4f)); + const _TpVec16F v_minus_DP3 = v_setall_<_TpVec16F>(hfloat(-3.77489497744594108E-8f)); + const _TpVec16F v_sincof_p0 = v_setall_<_TpVec16F>(hfloat(-1.9515295891E-4f)); + const _TpVec16F v_sincof_p1 = v_setall_<_TpVec16F>(hfloat(8.3321608736E-3f)); + const _TpVec16F v_sincof_p2 = v_setall_<_TpVec16F>(hfloat(-1.6666654611E-1f)); + const _TpVec16F v_coscof_p0 = v_setall_<_TpVec16F>(hfloat(2.443315711809948E-5f)); + const _TpVec16F v_coscof_p1 = v_setall_<_TpVec16F>(hfloat(-1.388731625493765E-3f)); + const _TpVec16F v_coscof_p2 = v_setall_<_TpVec16F>(hfloat(4.166664568298827E-2f)); + const _TpVec16F v_nan = v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7e00)); + const _TpVec16F v_neg_zero = v_setall_<_TpVec16F>(hfloat(-0.f)); + + _TpVec16F _vx, _vy, sign_mask_sin, sign_mask_cos; + _TpVec16S emm2; + + sign_mask_sin = v_lt(x, v_setzero_<_TpVec16F>()); + _vx = v_abs(x); + _vy = v_mul(_vx, v_cephes_FOPI); + + emm2 = v_trunc(_vy); + emm2 = v_add(emm2, v_setall_<_TpVec16S>((short)1)); + emm2 = v_and(emm2, v_setall_<_TpVec16S>((short)~1)); + _vy = v_cvt_f16(emm2); + + _TpVec16F poly_mask = v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)0))); + + _vx = v_fma(_vy, v_minus_DP1, _vx); + _vx = v_fma(_vy, v_minus_DP2, _vx); + _vx = v_fma(_vy, v_minus_DP3, _vx); + + sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f16(v_eq(v_and(emm2, v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0)))); + sign_mask_cos = v_reinterpret_as_f16(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec16S>((short)2)), v_setall_<_TpVec16S>((short)4)), v_setall_<_TpVec16S>((short)0))); + + _TpVec16F _vxx = v_mul(_vx, _vx); + _TpVec16F y1, y2; + + y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); + y1 = v_fma(y1, _vxx, v_coscof_p2); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(-0.5f))); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec16F>(hfloat(1.f))); + + y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); + y2 = v_fma(y2, _vxx, v_sincof_p2); + y2 = v_mul(y2, _vxx); + y2 = v_fma(y2, _vx, _vx); + + ysin = v_select(poly_mask, y2, y1); + ycos = v_select(poly_mask, y1, y2); + ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); + ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); + + // sincos(NAN) -> NAN, sincos(±INF) -> NAN + _TpVec16F mask_inf = v_eq(_vx, v_reinterpret_as_f16(v_setall_<_TpVec16S>((short)0x7c00))); + _TpVec16F mask_nan = v_or(mask_inf, v_ne(x, x)); + ysin = v_select(mask_nan, v_nan, ysin); + ycos = v_select(mask_nan, v_nan, ycos); +} + +template +inline _TpVec16F v_sin_default_16f(const _TpVec16F &x) { + _TpVec16F ysin, ycos; + v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); + return ysin; +} + +template +inline _TpVec16F v_cos_default_16f(const _TpVec16F &x) { + _TpVec16F ysin, ycos; + v_sincos_default_16f<_TpVec16F, _TpVec16S>(x, ysin, ycos); + return ycos; +} + + +template +inline void v_sincos_default_32f(const _TpVec32F &x, _TpVec32F &ysin, _TpVec32F &ycos) { + const _TpVec32F v_cephes_FOPI = v_setall_<_TpVec32F>(1.27323954473516f); // 4 / M_PI + const _TpVec32F v_minus_DP1 = v_setall_<_TpVec32F>(-0.78515625f); + const _TpVec32F v_minus_DP2 = 
v_setall_<_TpVec32F>(-2.4187564849853515625E-4f); + const _TpVec32F v_minus_DP3 = v_setall_<_TpVec32F>(-3.77489497744594108E-8f); + const _TpVec32F v_sincof_p0 = v_setall_<_TpVec32F>(-1.9515295891E-4f); + const _TpVec32F v_sincof_p1 = v_setall_<_TpVec32F>(8.3321608736E-3f); + const _TpVec32F v_sincof_p2 = v_setall_<_TpVec32F>(-1.6666654611E-1f); + const _TpVec32F v_coscof_p0 = v_setall_<_TpVec32F>(2.443315711809948E-5f); + const _TpVec32F v_coscof_p1 = v_setall_<_TpVec32F>(-1.388731625493765E-3f); + const _TpVec32F v_coscof_p2 = v_setall_<_TpVec32F>(4.166664568298827E-2f); + const _TpVec32F v_nan = v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7fc00000)); + const _TpVec32F v_neg_zero = v_setall_<_TpVec32F>(-0.f); + + _TpVec32F _vx, _vy, sign_mask_sin, sign_mask_cos; + _TpVec32S emm2; + + sign_mask_sin = v_lt(x, v_setzero_<_TpVec32F>()); + _vx = v_abs(x); + _vy = v_mul(_vx, v_cephes_FOPI); + + emm2 = v_trunc(_vy); + emm2 = v_add(emm2, v_setall_<_TpVec32S>(1)); + emm2 = v_and(emm2, v_setall_<_TpVec32S>(~1)); + _vy = v_cvt_f32(emm2); + + _TpVec32F poly_mask = v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(0))); + + _vx = v_fma(_vy, v_minus_DP1, _vx); + _vx = v_fma(_vy, v_minus_DP2, _vx); + _vx = v_fma(_vy, v_minus_DP3, _vx); + + sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f32(v_eq(v_and(emm2, v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0)))); + sign_mask_cos = v_reinterpret_as_f32(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec32S>(2)), v_setall_<_TpVec32S>(4)), v_setall_<_TpVec32S>(0))); + + _TpVec32F _vxx = v_mul(_vx, _vx); + _TpVec32F y1, y2; + + y1 = v_fma(v_coscof_p0, _vxx, v_coscof_p1); + y1 = v_fma(y1, _vxx, v_coscof_p2); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(-0.5f)); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec32F>(1.f)); + + y2 = v_fma(v_sincof_p0, _vxx, v_sincof_p1); + y2 = v_fma(y2, _vxx, v_sincof_p2); + y2 = v_mul(y2, _vxx); + y2 = v_fma(y2, _vx, _vx); + + ysin = v_select(poly_mask, y2, y1); + ycos = v_select(poly_mask, y1, y2); + ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); + ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); + + // sincos(NAN) -> NAN, sincos(±INF) -> NAN + _TpVec32F mask_inf = v_eq(_vx, v_reinterpret_as_f32(v_setall_<_TpVec32S>((int)0x7f800000))); + _TpVec32F mask_nan = v_or(mask_inf, v_ne(x, x)); + ysin = v_select(mask_nan, v_nan, ysin); + ycos = v_select(mask_nan, v_nan, ycos); +} + +template +inline _TpVec32F v_sin_default_32f(const _TpVec32F &x) { + _TpVec32F ysin, ycos; + v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); + return ysin; +} + +template +inline _TpVec32F v_cos_default_32f(const _TpVec32F &x) { + _TpVec32F ysin, ycos; + v_sincos_default_32f<_TpVec32F, _TpVec32S>(x, ysin, ycos); + return ycos; +} + +template +inline void v_sincos_default_64f(const _TpVec64F &x, _TpVec64F &ysin, _TpVec64F &ycos) { + const _TpVec64F v_cephes_FOPI = v_setall_<_TpVec64F>(1.2732395447351626861510701069801148); // 4 / M_PI + const _TpVec64F v_minus_DP1 = v_setall_<_TpVec64F>(-7.853981554508209228515625E-1); + const _TpVec64F v_minus_DP2 = v_setall_<_TpVec64F>(-7.94662735614792836714E-9); + const _TpVec64F v_minus_DP3 = v_setall_<_TpVec64F>(-3.06161699786838294307E-17); + const _TpVec64F v_sin_C1 = v_setall_<_TpVec64F>(1.58962301576546568060E-10); + const _TpVec64F v_sin_C2 = v_setall_<_TpVec64F>(-2.50507477628578072866E-8); + const _TpVec64F v_sin_C3 = v_setall_<_TpVec64F>(2.75573136213857245213E-6); + const _TpVec64F v_sin_C4 = v_setall_<_TpVec64F>(-1.98412698295895385996E-4); + 
const _TpVec64F v_sin_C5 = v_setall_<_TpVec64F>(8.33333333332211858878E-3); + const _TpVec64F v_sin_C6 = v_setall_<_TpVec64F>(-1.66666666666666307295E-1); + const _TpVec64F v_cos_C1 = v_setall_<_TpVec64F>(-1.13585365213876817300E-11); + const _TpVec64F v_cos_C2 = v_setall_<_TpVec64F>(2.08757008419747316778E-9); + const _TpVec64F v_cos_C3 = v_setall_<_TpVec64F>(-2.75573141792967388112E-7); + const _TpVec64F v_cos_C4 = v_setall_<_TpVec64F>(2.48015872888517045348E-5); + const _TpVec64F v_cos_C5 = v_setall_<_TpVec64F>(-1.38888888888730564116E-3); + const _TpVec64F v_cos_C6 = v_setall_<_TpVec64F>(4.16666666666665929218E-2); + const _TpVec64F v_nan = v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff8000000000000)); + const _TpVec64F v_neg_zero = v_setall_<_TpVec64F>(-0.0); + + _TpVec64F _vx, _vy, sign_mask_sin, sign_mask_cos; + _TpVec64S emm2; + + sign_mask_sin = v_lt(x, v_setzero_<_TpVec64F>()); + _vx = v_abs(x); + _vy = v_mul(_vx, v_cephes_FOPI); + + emm2 = v_expand_low(v_trunc(_vy)); + emm2 = v_add(emm2, v_setall_<_TpVec64S>((int64)1)); + emm2 = v_and(emm2, v_setall_<_TpVec64S>((int64)~1)); + _vy = v_cvt_f64(emm2); + + _TpVec64F poly_mask = v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)0))); + + _vx = v_fma(_vy, v_minus_DP1, _vx); + _vx = v_fma(_vy, v_minus_DP2, _vx); + _vx = v_fma(_vy, v_minus_DP3, _vx); + + sign_mask_sin = v_xor(sign_mask_sin, v_reinterpret_as_f64(v_eq(v_and(emm2, v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0)))); + sign_mask_cos = v_reinterpret_as_f64(v_eq(v_and(v_sub(emm2, v_setall_<_TpVec64S>((int64)2)), v_setall_<_TpVec64S>((int64)4)), v_setall_<_TpVec64S>((int64)0))); + + _TpVec64F _vxx = v_mul(_vx, _vx); + _TpVec64F y1, y2; + + y1 = v_fma(v_cos_C1, _vxx, v_cos_C2); + y1 = v_fma(y1, _vxx, v_cos_C3); + y1 = v_fma(y1, _vxx, v_cos_C4); + y1 = v_fma(y1, _vxx, v_cos_C5); + y1 = v_fma(y1, _vxx, v_cos_C6); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(-0.5)); + y1 = v_fma(y1, _vxx, v_setall_<_TpVec64F>(1.0)); + + y2 = v_fma(v_sin_C1, _vxx, v_sin_C2); + y2 = v_fma(y2, _vxx, v_sin_C3); + y2 = v_fma(y2, _vxx, v_sin_C4); + y2 = v_fma(y2, _vxx, v_sin_C5); + y2 = v_fma(y2, _vxx, v_sin_C6); + y2 = v_mul(y2, _vxx); + y2 = v_fma(y2, _vx, _vx); + + ysin = v_select(poly_mask, y2, y1); + ycos = v_select(poly_mask, y1, y2); + ysin = v_select(sign_mask_sin, ysin, v_xor(v_neg_zero, ysin)); + ycos = v_select(sign_mask_cos, v_xor(v_neg_zero, ycos), ycos); + + // sincos(NAN) -> NAN, sincos(±INF) -> NAN + _TpVec64F mask_inf = v_eq(_vx, v_reinterpret_as_f64(v_setall_<_TpVec64S>((int64)0x7ff0000000000000))); + _TpVec64F mask_nan = v_or(mask_inf, v_ne(x, x)); + ysin = v_select(mask_nan, v_nan, ysin); + ycos = v_select(mask_nan, v_nan, ycos); +} + +template +inline _TpVec64F v_sin_default_64f(const _TpVec64F &x) { + _TpVec64F ysin, ycos; + v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); + return ysin; +} + +template +inline _TpVec64F v_cos_default_64f(const _TpVec64F &x) { + _TpVec64F ysin, ycos; + v_sincos_default_64f<_TpVec64F, _TpVec64S>(x, ysin, ycos); + return ycos; +} +//! @} + + +/* This implementation is derived from the approximation approach of Error Function (Erf) from PyTorch + https://github.com/pytorch/pytorch/blob/9c50ecc84b9a6e699a7f058891b889aafbf976c7/aten/src/ATen/cpu/vec/vec512/vec512_float.h#L189-L220 +*/ + +//! @name Error Function +//! 
@{ +template +inline _TpVec32F v_erf_default_32f(const _TpVec32F &v) { + const _TpVec32F coef0 = v_setall_<_TpVec32F>(0.3275911f), + coef1 = v_setall_<_TpVec32F>(1.061405429f), + coef2 = v_setall_<_TpVec32F>(-1.453152027f), + coef3 = v_setall_<_TpVec32F>(1.421413741f), + coef4 = v_setall_<_TpVec32F>(-0.284496736f), + coef5 = v_setall_<_TpVec32F>(0.254829592f), + ones = v_setall_<_TpVec32F>(1.0f), + neg_zeros = v_setall_<_TpVec32F>(-0.f); + _TpVec32F t = v_abs(v); + // sign(v) + _TpVec32F sign_mask = v_and(neg_zeros, v); + + t = v_div(ones, v_fma(coef0, t, ones)); + _TpVec32F r = v_fma(coef1, t, coef2); + r = v_fma(r, t, coef3); + r = v_fma(r, t, coef4); + r = v_fma(r, t, coef5); + // - v * v + _TpVec32F v2 = v_mul(v, v); + _TpVec32F mv2 = v_xor(neg_zeros, v2); + // - exp(- v * v) + _TpVec32F exp = v_exp_default_32f<_TpVec32F, _TpVec32S>(mv2); + _TpVec32F neg_exp = v_xor(neg_zeros, exp); + _TpVec32F res = v_mul(t, neg_exp); + res = v_fma(r, res, ones); + return v_xor(sign_mask, res); +} +//! @} + +#endif // OPENCV_HAL_INTRIN_MATH_HPP diff --git a/3rdParty/opencv2/core/hal/intrin_msa.hpp b/3rdParty/opencv2/core/hal/intrin_msa.hpp index 4ac09ed404..98b753c42e 100644 --- a/3rdParty/opencv2/core/hal/intrin_msa.hpp +++ b/3rdParty/opencv2/core/hal/intrin_msa.hpp @@ -235,6 +235,8 @@ struct v_float64x2 #define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ +template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ @@ -345,53 +347,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) 
-OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64) -OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32) -OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64) -OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint8x16, msa_qaddq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint8x16, msa_qsubq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int8x16, msa_qaddq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int8x16, msa_qsubq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint16x8, msa_qaddq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint16x8, msa_qsubq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int16x8, msa_qaddq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int16x8, msa_qsubq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int32x4, msa_addq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int32x4, msa_subq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_int32x4, msa_mulq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint32x4, msa_addq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint32x4, msa_subq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_uint32x4, msa_mulq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float32x4, msa_addq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float32x4, msa_subq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float32x4, msa_mulq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_int64x2, msa_addq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_int64x2, msa_subq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_uint64x2, msa_addq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_uint64x2, msa_subq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float32x4, msa_divq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_add, v_float64x2, msa_addq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_sub, v_float64x2, msa_subq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_mul, v_float64x2, msa_mulq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(v_div, v_float64x2, msa_divq_f64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{a = a * b; return a; } +} OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) @@ -546,13 +541,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(msa_hadd_s64(prod, prod)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -596,10 +591,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \ -OPENCV_HAL_IMPL_MSA_BIN_OP(^, 
_Tpvec, msa_eorq_##suffix) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_and, _Tpvec, msa_andq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_or, _Tpvec, msa_orrq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(v_xor, _Tpvec, msa_eorq_##suffix) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ } @@ -614,21 +609,16 @@ OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64) OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64) #define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) -OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_and, msa_andq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_or, msa_orrq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(v_xor, msa_eorq_s32) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not(const v_float32x4& a) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); } @@ -659,21 +649,16 @@ OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64) OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64) #define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) -OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_and, msa_andq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_or, msa_orrq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(v_xor, msa_eorq_s64) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not(const v_float64x2& a) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); } @@ -704,17 +689,17 @@ OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64) OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64) #define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& 
b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) @@ -821,9 +806,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ // trade efficiency for convenience #define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \ @@ -1838,7 +1823,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a) ////// FP16 support /////// #if CV_FP16 -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { #ifndef msa_ld1_f16 v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr); @@ -1848,7 +1833,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) return v_float32x4(msa_cvt_f32_f16(v)); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { v4f16 hv = msa_cvt_f16_f32(v.val); @@ -1859,7 +1844,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v) #endif } #else -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { float buf[4]; for( int i = 0; i < 4; i++ ) @@ -1867,17 +1852,31 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) return v_load(buf); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { float buf[4]; v_store(buf, v); for( int i = 0; i < 4; i++ ) - ptr[i] = (float16_t)buf[i]; + ptr[i] = (hfloat)buf[i]; } #endif inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { 
return v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/intrin_neon.hpp b/3rdParty/opencv2/core/hal/intrin_neon.hpp index 17f60996c5..eb10edfc44 100644 --- a/3rdParty/opencv2/core/hal/intrin_neon.hpp +++ b/3rdParty/opencv2/core/hal/intrin_neon.hpp @@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN #define CV_NEON_AARCH64 0 #endif -// TODO -#define CV_NEON_DOT 0 //////////// Utils //////////// @@ -133,13 +131,22 @@ OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64) OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64) #endif +//////////// Compatibility layer //////////// +template struct VTraits { + static inline int vlanes() { return T::nlanes; } + enum { max_nlanes = T::nlanes, nlanes = T::nlanes }; + using lane_type = typename T::lane_type; +}; + +template +inline typename VTraits::lane_type v_get0(const T& v) \ +{ \ + return v.get0(); \ +} //////////// Types //////////// struct v_uint8x16 { - typedef uchar lane_type; - enum { nlanes = 16 }; - v_uint8x16() {} explicit v_uint8x16(uint8x16_t v) : val(v) {} v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, @@ -148,19 +155,22 @@ struct v_uint8x16 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_u8(v); } + uint8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef uchar lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint8x16& v); uchar get0() const { return vgetq_lane_u8(val, 0); } - - uint8x16_t val; }; struct v_int8x16 { - typedef schar lane_type; - enum { nlanes = 16 }; - v_int8x16() {} explicit v_int8x16(int8x16_t v) : val(v) {} v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, @@ -169,19 +179,22 @@ struct v_int8x16 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; val = vld1q_s8(v); } + int8x16_t val; + +private: + friend struct VTraits; + enum { nlanes = 16 }; + typedef schar lane_type; + + friend typename VTraits::lane_type v_get0(const v_int8x16& v); schar get0() const { return vgetq_lane_s8(val, 0); } - - int8x16_t val; }; struct v_uint16x8 { - typedef ushort lane_type; - enum { nlanes = 8 }; - v_uint16x8() {} explicit v_uint16x8(uint16x8_t v) : val(v) {} v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) @@ -189,19 +202,22 @@ struct v_uint16x8 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_u16(v); } + uint16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef ushort lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint16x8& v); ushort get0() const { return vgetq_lane_u16(val, 0); } - - uint16x8_t val; }; struct v_int16x8 { - typedef short lane_type; - enum { nlanes = 8 }; - v_int16x8() {} explicit v_int16x8(int16x8_t v) : val(v) {} v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) @@ -209,19 +225,22 @@ struct v_int16x8 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; val = vld1q_s16(v); } + int16x8_t val; + +private: + friend struct VTraits; + enum { nlanes = 8 }; + typedef short lane_type; + + friend typename VTraits::lane_type v_get0(const v_int16x8& v); short get0() const { return vgetq_lane_s16(val, 0); } - - int16x8_t val; }; struct v_uint32x4 { - typedef unsigned lane_type; - enum { nlanes = 4 }; - v_uint32x4() {} 
explicit v_uint32x4(uint32x4_t v) : val(v) {} v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) @@ -229,19 +248,22 @@ struct v_uint32x4 unsigned v[] = {v0, v1, v2, v3}; val = vld1q_u32(v); } + uint32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef unsigned lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint32x4& v); unsigned get0() const { return vgetq_lane_u32(val, 0); } - - uint32x4_t val; }; struct v_int32x4 { - typedef int lane_type; - enum { nlanes = 4 }; - v_int32x4() {} explicit v_int32x4(int32x4_t v) : val(v) {} v_int32x4(int v0, int v1, int v2, int v3) @@ -249,18 +271,22 @@ struct v_int32x4 int v[] = {v0, v1, v2, v3}; val = vld1q_s32(v); } + int32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef int lane_type; + + friend typename VTraits::lane_type v_get0(const v_int32x4& v); int get0() const { return vgetq_lane_s32(val, 0); } - int32x4_t val; }; struct v_float32x4 { - typedef float lane_type; - enum { nlanes = 4 }; - v_float32x4() {} explicit v_float32x4(float32x4_t v) : val(v) {} v_float32x4(float v0, float v1, float v2, float v3) @@ -268,18 +294,22 @@ struct v_float32x4 float v[] = {v0, v1, v2, v3}; val = vld1q_f32(v); } + float32x4_t val; + +private: + friend struct VTraits; + enum { nlanes = 4 }; + typedef float lane_type; + + friend typename VTraits::lane_type v_get0(const v_float32x4& v); float get0() const { return vgetq_lane_f32(val, 0); } - float32x4_t val; }; struct v_uint64x2 { - typedef uint64 lane_type; - enum { nlanes = 2 }; - v_uint64x2() {} explicit v_uint64x2(uint64x2_t v) : val(v) {} v_uint64x2(uint64 v0, uint64 v1) @@ -287,18 +317,21 @@ struct v_uint64x2 uint64 v[] = {v0, v1}; val = vld1q_u64(v); } + uint64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef uint64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_uint64x2& v); uint64 get0() const { return vgetq_lane_u64(val, 0); } - uint64x2_t val; }; struct v_int64x2 { - typedef int64 lane_type; - enum { nlanes = 2 }; - v_int64x2() {} explicit v_int64x2(int64x2_t v) : val(v) {} v_int64x2(int64 v0, int64 v1) @@ -306,19 +339,23 @@ struct v_int64x2 int64 v[] = {v0, v1}; val = vld1q_s64(v); } + int64x2_t val; + +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef int64 lane_type; + + friend typename VTraits::lane_type v_get0(const v_int64x2& v); int64 get0() const { return vgetq_lane_s64(val, 0); } - int64x2_t val; }; #if CV_SIMD128_64F struct v_float64x2 { - typedef double lane_type; - enum { nlanes = 2 }; - v_float64x2() {} explicit v_float64x2(float64x2_t v) : val(v) {} v_float64x2(double v0, double v1) @@ -326,17 +363,26 @@ struct v_float64x2 double v[] = {v0, v1}; val = vld1q_f64(v); } + + float64x2_t val; +private: + friend struct VTraits; + enum { nlanes = 2 }; + typedef double lane_type; + + friend typename VTraits::lane_type v_get0(const v_float64x2& v); double get0() const { return vgetq_lane_f64(val, 0); } - float64x2_t val; }; #endif #define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \ inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \ inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \ +template <> inline v_##_Tpv v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tpv v_setall_(_Tp v) { return v_setall_##suffix(v); } \ inline _Tpv##_t vreinterpretq_##suffix##_##suffix(_Tpv##_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return 
v_uint8x16(vreinterpretq_u8_##suffix(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(vreinterpretq_s8_##suffix(v.val)); } \ @@ -462,71 +508,56 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op (const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint8x16, vqaddq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint8x16, vqsubq_u8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int8x16, vqsubq_s8) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint16x8, vqaddq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint16x8, vqsubq_u16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int16x8, vqaddq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int16x8, vqsubq_s16) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int32x4, vaddq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int32x4, vsubq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_int32x4, vmulq_s32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint32x4, vaddq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint32x4, vsubq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_uint32x4, vmulq_u32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float32x4, vaddq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float32x4, vsubq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float32x4, vmulq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int64x2, vaddq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_int64x2, vsubq_s64) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_uint64x2, vaddq_u64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_uint64x2, vsubq_u64) +} + +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint8x16, vqaddq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint8x16, vqsubq_u8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int8x16, vqaddq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int8x16, vqsubq_s8) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint16x8, vqaddq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint16x8, vqsubq_u16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int16x8, vqaddq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int16x8, vqsubq_s16) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int32x4, vaddq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int32x4, vsubq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_int32x4, vmulq_s32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint32x4, vaddq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint32x4, vsubq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_uint32x4, vmulq_u32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float32x4, vaddq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float32x4, vsubq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float32x4, vmulq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_int64x2, vaddq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_int64x2, vsubq_s64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_uint64x2, vaddq_u64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_uint64x2, vsubq_u64) #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float32x4, vdivq_f32) -OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_float64x2, vaddq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(-, v_float64x2, vsubq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(*, v_float64x2, vmulq_f64) -OPENCV_HAL_IMPL_NEON_BIN_OP(/, v_float64x2, vdivq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float32x4, vdivq_f32) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_add, v_float64x2, vaddq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_sub, v_float64x2, vsubq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_mul, v_float64x2, 
vmulq_f64) +OPENCV_HAL_IMPL_NEON_BIN_OP(v_div, v_float64x2, vdivq_f64) #else -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_div (const v_float32x4& a, const v_float32x4& b) { float32x4_t reciprocal = vrecpeq_f32(b.val); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); return v_float32x4(vmulq_f32(a.val, reciprocal)); } -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) -{ - float32x4_t reciprocal = vrecpeq_f32(b.val); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(b.val, reciprocal), reciprocal); - a.val = vmulq_f32(a.val, reciprocal); - return a; -} #endif // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul (const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_NEON_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_NEON_MUL_SAT(v_uint8x16, v_uint16x8) @@ -665,11 +696,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64 } // 8 >> 32 +#ifdef CV_NEON_DOT +#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \ +inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \ +{ \ + return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\ +} \ +inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \ +{ \ + return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_uint32x4, v_uint8x16, u32) +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(v_int32x4, v_int8x16, s32) +#else inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); -#else const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0)); const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF)); const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0)); @@ -685,23 +727,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16), vshrq_n_u32(vreinterpretq_u32_u16(odd), 16)); return v_uint32x4(vaddq_u32(s0, s1)); -#endif } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); -#else - return v_dotprod_expand(a, b) + c; -#endif + return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); -#else int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val)); int16x8_t uzp1, uzp2; @@ -710,18 +744,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) int16x4_t uzpl1, uzpl2; _v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2); return v_int32x4(vaddl_s16(uzpl1, uzpl2)); -#endif } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(c.val, a.val, b.val)); -#else - return 
v_dotprod_expand(a, b) + c; -#endif + return v_add(v_dotprod_expand(a, b), c); } - +#endif // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) { @@ -739,7 +768,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -756,7 +785,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f #if CV_SIMD128_64F @@ -764,7 +793,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } #endif //////// Fast Dot Product //////// @@ -830,45 +859,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_ } // 8 >> 32 +#ifdef CV_NEON_DOT +#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \ +inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \ +{ \ + return v_dotprod_expand(a, b); \ +} \ +inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \ +{ \ + return v_dotprod_expand(a, b, c); \ +} + +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_uint32x4, v_uint8x16, u32) +OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(v_int32x4, v_int8x16, s32) +#else inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val)); -#else uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val)); uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val)); uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1)); uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1)); return v_uint32x4(vaddq_u32(s0, s1)); -#endif } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { -#if CV_NEON_DOT - return v_uint32x4(vdotq_u32(c.val, a.val, b.val)); -#else - return v_dotprod_expand_fast(a, b) + c; -#endif + return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val)); -#else int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val)); prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val)); return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod))); -#endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { -#if CV_NEON_DOT - return v_int32x4(vdotq_s32(c.val, a.val, b.val)); -#else - return v_dotprod_expand_fast(a, b) + c; -#endif + return v_add(v_dotprod_expand_fast(a, b), c); } +#endif // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -880,7 +908,7 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b return v_uint64x2(vaddq_u64(s0, s1)); } inline v_uint64x2 
v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { @@ -889,22 +917,22 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod))); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f #if CV_SIMD128_64F inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod_fast(a, b)); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } #endif #define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \ - OPENCV_HAL_IMPL_NEON_BIN_OP(^, _Tpvec, veorq_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_and, _Tpvec, vandq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_or, _Tpvec, vorrq_##suffix) \ + OPENCV_HAL_IMPL_NEON_BIN_OP(v_xor, _Tpvec, veorq_##suffix) \ + inline _Tpvec v_not (const _Tpvec& a) \ { \ return _Tpvec(vreinterpretq_##suffix##_u8(vmvnq_u8(vreinterpretq_u8_##suffix(a.val)))); \ } @@ -919,21 +947,16 @@ OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_uint64x2, u64) OPENCV_HAL_IMPL_NEON_LOGIC_OP(v_int64x2, s64) #define OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op (const v_float32x4& a, const v_float32x4& b) \ { \ return v_float32x4(vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val)))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = vreinterpretq_f32_s32(intrin(vreinterpretq_s32_f32(a.val), vreinterpretq_s32_f32(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(&, vandq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(|, vorrq_s32) -OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(^, veorq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_and, vandq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_or, vorrq_s32) +OPENCV_HAL_IMPL_NEON_FLT_BIT_OP(v_xor, veorq_s32) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not (const v_float32x4& a) { return v_float32x4(vreinterpretq_f32_s32(vmvnq_s32(vreinterpretq_s32_f32(a.val)))); } @@ -947,7 +970,7 @@ inline v_float32x4 v_sqrt(const v_float32x4& x) inline v_float32x4 v_invsqrt(const v_float32x4& x) { v_float32x4 one = v_setall_f32(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } #else inline v_float32x4 v_sqrt(const v_float32x4& x) @@ -980,21 +1003,16 @@ inline v_float32x4 v_abs(v_float32x4 x) #if CV_SIMD128_64F #define OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op (const v_float64x2& a, const v_float64x2& b) \ { \ return v_float64x2(vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val)))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a.val = 
vreinterpretq_f64_s64(intrin(vreinterpretq_s64_f64(a.val), vreinterpretq_s64_f64(b.val))); \ - return a; \ } -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(&, vandq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(|, vorrq_s64) -OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(^, veorq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_and, vandq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_or, vorrq_s64) +OPENCV_HAL_IMPL_NEON_DBL_BIT_OP(v_xor, veorq_s64) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not (const v_float64x2& a) { return v_float64x2(vreinterpretq_f64_s32(vmvnq_s32(vreinterpretq_s32_f64(a.val)))); } @@ -1007,7 +1025,7 @@ inline v_float64x2 v_sqrt(const v_float64x2& x) inline v_float64x2 v_invsqrt(const v_float64x2& x) { v_float64x2 one = v_setall_f64(1.0f); - return one / v_sqrt(x); + return v_div(one, v_sqrt(x)); } inline v_float64x2 v_abs(v_float64x2 x) @@ -1041,30 +1059,18 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_min, vminq_f64) OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_float64x2, v_max, vmaxq_f64) #endif -#if CV_SIMD128_64F -inline int64x2_t vmvnq_s64(int64x2_t a) -{ - int64x2_t vx = vreinterpretq_s64_u32(vdupq_n_u32(0xFFFFFFFF)); - return veorq_s64(a, vx); -} -inline uint64x2_t vmvnq_u64(uint64x2_t a) -{ - uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)); - return veorq_u64(a, vx); -} -#endif #define OPENCV_HAL_IMPL_NEON_INT_CMP_OP(_Tpvec, cast, suffix, not_suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vceqq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vmvnq_##not_suffix(vceqq_##suffix(a.val, b.val)))); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcltq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgtq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcleq_##suffix(a.val, b.val))); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(cast(vcgeq_##suffix(a.val, b.val))); } OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint8x16, OPENCV_HAL_NOP, u8, u8) @@ -1074,9 +1080,47 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int16x8, vreinterpretq_s16_u16, s16, u16) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint32x4, OPENCV_HAL_NOP, u32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int32x4, vreinterpretq_s32_u32, s32, u32) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float32x4, vreinterpretq_f32_u32, f32, u32) +#if defined(__aarch64__) || defined(_M_ARM64) +static inline uint64x2_t vmvnq_u64(uint64x2_t a) +{ + uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)); + return veorq_u64(a, vx); +} +//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64) +//OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64) +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) +{ return v_uint64x2(vceqq_u64(a.val, b.val)); } +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) +{ return v_uint64x2(vmvnq_u64(vceqq_u64(a.val, b.val))); } +static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) +{ return 
v_int64x2(vreinterpretq_s64_u64(vceqq_s64(a.val, b.val))); } +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) +{ return v_int64x2(vreinterpretq_s64_u64(vmvnq_u64(vceqq_s64(a.val, b.val)))); } +#else +static inline v_uint64x2 v_eq (const v_uint64x2& a, const v_uint64x2& b) +{ + uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); + uint32x4_t swapped = vrev64q_u32(cmp); + return v_uint64x2(vreinterpretq_u64_u32(vandq_u32(cmp, swapped))); +} +static inline v_uint64x2 v_ne (const v_uint64x2& a, const v_uint64x2& b) +{ + uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_u64(a.val), vreinterpretq_u32_u64(b.val)); + uint32x4_t swapped = vrev64q_u32(cmp); + uint64x2_t v_eq = vreinterpretq_u64_u32(vandq_u32(cmp, swapped)); + uint64x2_t vx = vreinterpretq_u64_u32(vdupq_n_u32(0xFFFFFFFF)); + return v_uint64x2(veorq_u64(v_eq, vx)); +} +static inline v_int64x2 v_eq (const v_int64x2& a, const v_int64x2& b) +{ + return v_reinterpret_as_s64(v_eq(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); +} +static inline v_int64x2 v_ne (const v_int64x2& a, const v_int64x2& b) +{ + return v_reinterpret_as_s64(v_ne(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); +} +#endif #if CV_SIMD128_64F -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_uint64x2, OPENCV_HAL_NOP, u64, u64) -OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64) OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64) #endif @@ -1186,9 +1230,9 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ // trade efficiency for convenience #define OPENCV_HAL_IMPL_NEON_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)n))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr (const _Tpvec& a, int n) \ { return _Tpvec(vshlq_##suffix(a.val, vdupq_n_##ssuffix((_Tps)-n))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vshlq_n_##suffix(a.val, n)); } \ @@ -1210,13 +1254,13 @@ OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int64x2, s64, int64, s64) template inline _Tpvec v_rotate_right(const _Tpvec& a) \ { return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); } \ +{ return _Tpvec(vextq_##suffix(b.val, a.val, VTraits<_Tpvec>::nlanes - n)); } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { CV_UNUSED(b); return a; } @@ -1600,7 +1644,7 @@ inline int v_signmask(const v_uint64x2& a) #if CV_NEON_AARCH64 const int64x2_t signPosition = {0,1}; uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), signPosition); - uint64_t t0 = vaddvq_u64(v0); + int t0 = (int)vaddvq_u64(v0); return t0; #else // #if CV_NEON_AARCH64 int64x1_t m0 = vdup_n_s64(0); @@ -1948,11 +1992,9 @@ inline v_int32x4 v_round(const v_float32x4& a) #else inline v_int32x4 v_round(const v_float32x4& 
a) { - static const int32x4_t v_sign = vdupq_n_s32(1 << 31), - v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); - - int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); - return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); + // See https://github.com/opencv/opencv/pull/24271#issuecomment-1867318007 + float32x4_t delta = vdupq_n_f32(12582912.0f); + return v_int32x4(vcvtq_s32_f32(vsubq_f32(vaddq_f32(a.val, delta), delta))); } #endif inline v_int32x4 v_floor(const v_float32x4& a) @@ -1976,12 +2018,12 @@ inline v_int32x4 v_trunc(const v_float32x4& a) inline v_int32x4 v_round(const v_float64x2& a) { static const int32x2_t zero = vdup_n_s32(0); - return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), zero)); + return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), zero)); } inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) { - return v_int32x4(vcombine_s32(vmovn_s64(vcvtaq_s64_f64(a.val)), vmovn_s64(vcvtaq_s64_f64(b.val)))); + return v_int32x4(vcombine_s32(vmovn_s64(vcvtnq_s64_f64(a.val)), vmovn_s64(vcvtnq_s64_f64(b.val)))); } inline v_int32x4 v_floor(const v_float64x2& a) @@ -2565,7 +2607,7 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo ////// FP16 support /////// #if CV_FP16 -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { float16x4_t v = #ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro @@ -2576,7 +2618,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) return v_float32x4(vcvt_f32_f16(v)); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { float16x4_t hv = vcvt_f16_f32(v.val); @@ -2587,7 +2629,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v) #endif } #else -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { const int N = 4; float buf[N]; @@ -2595,17 +2637,39 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) return v_load(buf); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { const int N = 4; float buf[N]; v_store(buf, v); - for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); + for( int i = 0; i < N; i++ ) ptr[i] = hfloat(buf[i]); } #endif inline void v_cleanup() {} +#include "intrin_math.hpp" +#if defined(CV_SIMD_FP16) && CV_SIMD_FP16 +inline v_float16x8 v_exp(const v_float16x8& x) { return v_exp_default_16f(x); } +inline v_float16x8 v_log(const v_float16x8& x) { return v_log_default_16f(x); } +inline void v_sincos(const v_float16x8& x, v_float16x8& s, v_float16x8& c) { v_sincos_default_16f(x, s, c); } +inline v_float16x8 v_sin(const v_float16x8& x) { return v_sin_default_16f(x); } +inline v_float16x8 v_cos(const v_float16x8& x) { return v_cos_default_16f(x); } +#endif +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } +#if CV_SIMD128_64F +inline v_float64x2 v_exp(const 
v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } +#endif + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/intrin_rvv.hpp b/3rdParty/opencv2/core/hal/intrin_rvv.hpp deleted file mode 100644 index f5902e4f77..0000000000 --- a/3rdParty/opencv2/core/hal/intrin_rvv.hpp +++ /dev/null @@ -1,3320 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// The original implementation has been contributed by Yin Zhang. -// Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences. - -#ifndef OPENCV_HAL_INTRIN_RVV_HPP -#define OPENCV_HAL_INTRIN_RVV_HPP - -#include - -namespace cv -{ - -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN - -#define CV_SIMD128 1 -#define CV_SIMD128_64F 1 - -//////////// Unsupported native intrinsics in C++ //////////// -// The following types have been defined in clang, but not in GCC yet. -#ifndef __clang__ - -struct vuint8mf2_t -{ - uchar val[8] = {0}; - vuint8mf2_t() {} - vuint8mf2_t(const uchar* ptr) - { - for (int i = 0; i < 8; ++i) - { - val[i] = ptr[i]; - } - } -}; -struct vint8mf2_t -{ - schar val[8] = {0}; - vint8mf2_t() {} - vint8mf2_t(const schar* ptr) - { - for (int i = 0; i < 8; ++i) - { - val[i] = ptr[i]; - } - } -}; -struct vuint16mf2_t -{ - ushort val[4] = {0}; - vuint16mf2_t() {} - vuint16mf2_t(const ushort* ptr) - { - for (int i = 0; i < 4; ++i) - { - val[i] = ptr[i]; - } - } -}; -struct vint16mf2_t -{ - short val[4] = {0}; - vint16mf2_t() {} - vint16mf2_t(const short* ptr) - { - for (int i = 0; i < 4; ++i) - { - val[i] = ptr[i]; - } - } -}; -struct vuint32mf2_t -{ - unsigned val[2] = {0}; - vuint32mf2_t() {} - vuint32mf2_t(const unsigned* ptr) - { - val[0] = ptr[0]; - val[1] = ptr[1]; - } -}; -struct vint32mf2_t -{ - int val[2] = {0}; - vint32mf2_t() {} - vint32mf2_t(const int* ptr) - { - val[0] = ptr[0]; - val[1] = ptr[1]; - } -}; -struct vfloat32mf2_t -{ - float val[2] = {0}; - vfloat32mf2_t() {} - vfloat32mf2_t(const float* ptr) - { - val[0] = ptr[0]; - val[1] = ptr[1]; - } -}; -struct vuint64mf2_t -{ - uint64 val[1] = {0}; - vuint64mf2_t() {} - vuint64mf2_t(const uint64* ptr) - { - val[0] = ptr[0]; - } -}; -struct vint64mf2_t -{ - int64 val[1] = {0}; - vint64mf2_t() {} - vint64mf2_t(const int64* ptr) - { - val[0] = ptr[0]; - } -}; -struct vfloat64mf2_t -{ - double val[1] = {0}; - vfloat64mf2_t() {} - vfloat64mf2_t(const double* ptr) - { - val[0] = ptr[0]; - } -}; -struct vuint8mf4_t -{ - uchar val[4] = {0}; - vuint8mf4_t() {} - vuint8mf4_t(const uchar* ptr) - { - for (int i = 0; i < 4; ++i) - { - val[i] = ptr[i]; - } - } -}; -struct vint8mf4_t -{ - schar val[4] = {0}; - vint8mf4_t() {} - vint8mf4_t(const schar* ptr) - { - for (int i = 0; i < 4; ++i) - { - val[i] = ptr[i]; - } - } -}; - -#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \ -inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr, size_t vl) \ -{ \ - CV_UNUSED(vl); \ - return _Tpvec(ptr); \ -} \ -inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v, size_t vl) \ -{ \ - CV_UNUSED(vl); \ - for (int i = 0; i 
< n; ++i) \ - { \ - ptr[i] = v.val[i]; \ - } \ -} - -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1) -OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1) - - -#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \ -inline _Tpwvec wcvt (_Tpvec v, size_t vl) \ -{ \ - _wTp tmp[n]; \ - for (int i = 0; i < n; ++i) \ - { \ - tmp[i] = (_wTp)v.val[i]; \ - } \ - return vle##width##_v_##suffix##m1(tmp, vl); \ -} - -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8) -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8) -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4) -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4) -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2) -OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2) - -inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base, size_t vl) -{ - CV_UNUSED(vl); - return vuint8mf4_t(base); -} -inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base, size_t vl) -{ - CV_UNUSED(vl); - return vint8mf4_t(base); -} - -inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src, size_t vl) -{ - ushort tmp[4]; - for (int i = 0; i < 4; ++i) - { - tmp[i] = (ushort)src.val[i]; - } - return vle16_v_u16mf2(tmp, vl); -} -inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src, size_t vl) -{ - short tmp[4]; - for (int i = 0; i < 4; ++i) - { - tmp[i] = (short)src.val[i]; - } - return vle16_v_i16mf2(tmp, vl); -} -#endif - -//////////// Types //////////// - -#ifndef __clang__ -struct v_uint8x16 -{ - typedef uchar lane_type; - enum { nlanes = 16 }; - - v_uint8x16() {} - explicit v_uint8x16(vuint8m1_t v) - { - vse8_v_u8m1(val, v, nlanes); - } - v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, - uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) - { - uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vuint8m1_t() const - { - return vle8_v_u8m1(val, nlanes); - } - uchar get0() const - { - return val[0]; - } - - uchar val[16]; -}; - -struct v_int8x16 -{ - typedef schar lane_type; - enum { nlanes = 16 }; - - v_int8x16() {} - explicit v_int8x16(vint8m1_t v) - { - vse8_v_i8m1(val, v, nlanes); - } - v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, - schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) - { - schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vint8m1_t() const 
- { - return vle8_v_i8m1(val, nlanes); - } - schar get0() const - { - return val[0]; - } - - schar val[16]; -}; - -struct v_uint16x8 -{ - typedef ushort lane_type; - enum { nlanes = 8 }; - - v_uint16x8() {} - explicit v_uint16x8(vuint16m1_t v) - { - vse16_v_u16m1(val, v, nlanes); - } - v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) - { - ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vuint16m1_t() const - { - return vle16_v_u16m1(val, nlanes); - } - ushort get0() const - { - return val[0]; - } - - ushort val[8]; -}; - -struct v_int16x8 -{ - typedef short lane_type; - enum { nlanes = 8 }; - - v_int16x8() {} - explicit v_int16x8(vint16m1_t v) - { - vse16_v_i16m1(val, v, nlanes); - } - v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vint16m1_t() const - { - return vle16_v_i16m1(val, nlanes); - } - short get0() const - { - return val[0]; - } - - short val[8]; -}; - -struct v_uint32x4 -{ - typedef unsigned lane_type; - enum { nlanes = 4 }; - - v_uint32x4() {} - explicit v_uint32x4(vuint32m1_t v) - { - vse32_v_u32m1(val, v, nlanes); - } - v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) - { - unsigned v[] = {v0, v1, v2, v3}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vuint32m1_t() const - { - return vle32_v_u32m1(val, nlanes); - } - unsigned get0() const - { - return val[0]; - } - - unsigned val[4]; -}; - -struct v_int32x4 -{ - typedef int lane_type; - enum { nlanes = 4 }; - - v_int32x4() {} - explicit v_int32x4(vint32m1_t v) - { - vse32_v_i32m1(val, v, nlanes); - } - v_int32x4(int v0, int v1, int v2, int v3) - { - int v[] = {v0, v1, v2, v3}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vint32m1_t() const - { - return vle32_v_i32m1(val, nlanes); - } - int get0() const - { - return val[0]; - } - int val[4]; -}; - -struct v_float32x4 -{ - typedef float lane_type; - enum { nlanes = 4 }; - - v_float32x4() {} - explicit v_float32x4(vfloat32m1_t v) - { - vse32_v_f32m1(val, v, nlanes); - } - v_float32x4(float v0, float v1, float v2, float v3) - { - float v[] = {v0, v1, v2, v3}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vfloat32m1_t() const - { - return vle32_v_f32m1(val, nlanes); - } - float get0() const - { - return val[0]; - } - float val[4]; -}; - -struct v_uint64x2 -{ - typedef uint64 lane_type; - enum { nlanes = 2 }; - - v_uint64x2() {} - explicit v_uint64x2(vuint64m1_t v) - { - vse64_v_u64m1(val, v, nlanes); - } - v_uint64x2(uint64 v0, uint64 v1) - { - uint64 v[] = {v0, v1}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vuint64m1_t() const - { - return vle64_v_u64m1(val, nlanes); - } - uint64 get0() const - { - return val[0]; - } - - uint64 val[2]; -}; - -struct v_int64x2 -{ - typedef int64 lane_type; - enum { nlanes = 2 }; - - v_int64x2() {} - explicit v_int64x2(vint64m1_t v) - { - vse64_v_i64m1(val, v, nlanes); - } - v_int64x2(int64 v0, int64 v1) - { - int64 v[] = {v0, v1}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vint64m1_t() const - { - return vle64_v_i64m1(val, nlanes); - } - int64 get0() const - { - return val[0]; - } - - int64 val[2]; -}; - -#if CV_SIMD128_64F -struct v_float64x2 -{ - typedef double lane_type; - enum { nlanes = 2 
}; - - v_float64x2() {} - explicit v_float64x2(vfloat64m1_t v) - { - vse64_v_f64m1(val, v, nlanes); - } - v_float64x2(double v0, double v1) - { - double v[] = {v0, v1}; - for (int i = 0; i < nlanes; ++i) - { - val[i] = v[i]; - } - } - operator vfloat64m1_t() const - { - return vle64_v_f64m1(val, nlanes); - } - double get0() const - { - return val[0]; - } - - double val[2]; -}; -#endif -#else -struct v_uint8x16 -{ - typedef uchar lane_type; - enum { nlanes = 16 }; - - v_uint8x16() {} - explicit v_uint8x16(vuint8m1_t v) - { - *pval = v; - } - v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, - uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) - { - uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - *pval = vle8_v_u8m1(v, nlanes); - } - operator vuint8m1_t() const - { - return *pval; - } - uchar get0() const - { - return vmv_x(*pval); - } - inline v_uint8x16& operator=(const v_uint8x16& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_uint8x16(const v_uint8x16& vec) { - *pval = *(vec.pval); - } - uchar val[16]; - vuint8m1_t* pval = (vuint8m1_t*)val; -}; - -struct v_int8x16 -{ - typedef schar lane_type; - enum { nlanes = 16 }; - - v_int8x16() {} - explicit v_int8x16(vint8m1_t v) - { - *pval = v; - } - v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, - schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) - { - schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - *pval = vle8_v_i8m1(v, nlanes); - } - operator vint8m1_t() const - { - return *pval; - } - schar get0() const - { - return vmv_x(*pval); - } - inline v_int8x16& operator=(const v_int8x16& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_int8x16(const v_int8x16& vec) { - *pval = *(vec.pval); - } - schar val[16]; - vint8m1_t* pval = (vint8m1_t*)val; -}; - -struct v_uint16x8 -{ - typedef ushort lane_type; - enum { nlanes = 8 }; - - v_uint16x8() {} - explicit v_uint16x8(vuint16m1_t v) - { - *pval = v; - } - v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) - { - ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - *pval = vle16_v_u16m1(v, nlanes); - } - operator vuint16m1_t() const - { - return *pval; - } - ushort get0() const - { - return vmv_x(*pval); - } - - inline v_uint16x8& operator=(const v_uint16x8& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_uint16x8(const v_uint16x8& vec) { - *pval = *(vec.pval); - } - ushort val[8]; - vuint16m1_t* pval = (vuint16m1_t*)val; -}; - -struct v_int16x8 -{ - typedef short lane_type; - enum { nlanes = 8 }; - - v_int16x8() {} - explicit v_int16x8(vint16m1_t v) - { - *pval = v; - } - v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) - { - short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - *pval = vle16_v_i16m1(v, nlanes); - } - operator vint16m1_t() const - { - return *pval; - } - short get0() const - { - return vmv_x(*pval); - } - - inline v_int16x8& operator=(const v_int16x8& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_int16x8(const v_int16x8& vec) { - *pval = *(vec.pval); - } - short val[8]; - vint16m1_t* pval = (vint16m1_t*)val; -}; - -struct v_uint32x4 -{ - typedef unsigned lane_type; - enum { nlanes = 4 }; - - v_uint32x4() {} - explicit v_uint32x4(vuint32m1_t v) - { - *pval = v; - } - v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) - 
{ - unsigned v[] = {v0, v1, v2, v3}; - *pval = vle32_v_u32m1(v, nlanes); - } - operator vuint32m1_t() const - { - return *pval; - } - unsigned get0() const - { - return vmv_x(*pval); - } - - inline v_uint32x4& operator=(const v_uint32x4& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_uint32x4(const v_uint32x4& vec) { - *pval = *(vec.pval); - } - unsigned val[4]; - vuint32m1_t* pval = (vuint32m1_t*)val; -}; - -struct v_int32x4 -{ - typedef int lane_type; - enum { nlanes = 4 }; - - v_int32x4() {} - explicit v_int32x4(vint32m1_t v) - { - *pval = v; - } - v_int32x4(int v0, int v1, int v2, int v3) - { - int v[] = {v0, v1, v2, v3}; - *pval = vle32_v_i32m1(v, nlanes); - } - operator vint32m1_t() const - { - return *pval; - } - int get0() const - { - return vmv_x(*pval); - } - - inline v_int32x4& operator=(const v_int32x4& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_int32x4(const v_int32x4& vec) { - *pval = *(vec.pval); - } - int val[4]; - vint32m1_t* pval = (vint32m1_t*)val; -}; - -struct v_float32x4 -{ - typedef float lane_type; - enum { nlanes = 4 }; - - v_float32x4() {} - explicit v_float32x4(vfloat32m1_t v) - { - *pval = v; - } - v_float32x4(float v0, float v1, float v2, float v3) - { - float v[] = {v0, v1, v2, v3}; - *pval = vle32_v_f32m1(v, nlanes); - } - operator vfloat32m1_t() const - { - return *pval; - } - float get0() const - { - return vfmv_f(*pval); - } - inline v_float32x4& operator=(const v_float32x4& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_float32x4(const v_float32x4& vec) { - *pval = *(vec.pval); - } - float val[4]; - vfloat32m1_t* pval = (vfloat32m1_t*)val; -}; - -struct v_uint64x2 -{ - typedef uint64 lane_type; - enum { nlanes = 2 }; - - v_uint64x2() {} - explicit v_uint64x2(vuint64m1_t v) - { - *pval = v; - } - v_uint64x2(uint64 v0, uint64 v1) - { - uint64 v[] = {v0, v1}; - *pval = vle64_v_u64m1(v, nlanes); - } - operator vuint64m1_t() const - { - return *pval; - } - uint64 get0() const - { - return vmv_x(*pval); - } - - inline v_uint64x2& operator=(const v_uint64x2& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_uint64x2(const v_uint64x2& vec) { - *pval = *(vec.pval); - } - uint64 val[2]; - vuint64m1_t* pval = (vuint64m1_t*)val; -}; - -struct v_int64x2 -{ - typedef int64 lane_type; - enum { nlanes = 2 }; - - v_int64x2() {} - explicit v_int64x2(vint64m1_t v) - { - *pval = v; - } - v_int64x2(int64 v0, int64 v1) - { - int64 v[] = {v0, v1}; - *pval = vle64_v_i64m1(v, nlanes); - } - operator vint64m1_t() const - { - return *pval; - } - int64 get0() const - { - return vmv_x(*pval); - } - - inline v_int64x2& operator=(const v_int64x2& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_int64x2(const v_int64x2& vec) { - *pval = *(vec.pval); - } - int64 val[2]; - vint64m1_t* pval = (vint64m1_t*)val; -}; - -#if CV_SIMD128_64F -struct v_float64x2 -{ - typedef double lane_type; - enum { nlanes = 2 }; - - v_float64x2() {} - explicit v_float64x2(vfloat64m1_t v) - { - *pval = v; - } - v_float64x2(double v0, double v1) - { - double v[] = {v0, v1}; - *pval = vle64_v_f64m1(v, nlanes); - } - operator vfloat64m1_t() const - { - return *pval; - } - double get0() const - { - return vfmv_f(*pval); - } - - inline v_float64x2& operator=(const v_float64x2& vec) { - *pval = *(vec.pval); - return *this; - } - inline v_float64x2(const v_float64x2& vec) { - *pval = *(vec.pval); - } - double val[2]; - vfloat64m1_t* pval = (vfloat64m1_t*)val; -}; -#endif // CV_SIMD128_64F -#endif // __clang__ - -//////////// Initial //////////// - 
-#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \ -inline v_##_Tpvec v_setzero_##suffix1() \ -{ \ - return v_##_Tpvec(vmv_v_x_##suffix2##m1(0, vl)); \ -} \ -inline v_##_Tpvec v_setall_##suffix1(_Tp v) \ -{ \ - return v_##_Tpvec(vmv_v_x_##suffix2##m1(v, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, u8, u8, 16) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, s8, i8, 16) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, u16, u16, 8) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, s16, i16, 8) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, u32, u32, 4) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, s32, i32, 4) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, u64, u64, 2) -OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, s64, i64, 2) - -#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \ -inline v_##_Tpv v_setzero_##suffix() \ -{ \ - return v_##_Tpv(vfmv_v_f_##suffix##m1(0, vl)); \ -} \ -inline v_##_Tpv v_setall_##suffix(_Tp v) \ -{ \ - return v_##_Tpv(vfmv_v_f_##suffix##m1(v, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, f32, 4) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, f64, 2) -#endif - -//////////// Reinterpret //////////// - -#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \ -inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; } - -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64) -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64) -#endif - -#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \ -inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ -{ \ - return v_##_Tpvec1(vreinterpret_v_##nsuffix2##m1_##nsuffix1##m1(v));\ -} \ -inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ -{ \ - return v_##_Tpvec2(vreinterpret_v_##nsuffix1##m1_##nsuffix2##m1(v));\ -} - -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, int8x16, u8, s8, u8, i8) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, int16x8, u16, s16, u16, i16) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, int32x4, u32, s32, u32, i32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, float32x4, u32, f32, u32, f32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, float32x4, s32, f32, i32, f32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, int64x2, u64, s64, u64, i64) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64x2, float64x2, u64, f64, u64, f64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64x2, float64x2, s64, f64, i64, f64) -#endif -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint16x8, u8, u16, u8, u16) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint32x4, u8, u32, u8, u32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8x16, uint64x2, u8, u64, u8, u64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint32x4, u16, u32, u16, u32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16x8, uint64x2, u16, u64, u16, u64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32x4, uint64x2, u32, u64, u32, u64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, 
int16x8, s8, s16, i8, i16) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int32x4, s8, s32, i8, i32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8x16, int64x2, s8, s64, i8, i64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int32x4, s16, s32, i16, i32) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16x8, int64x2, s16, s64, i16, i64) -OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32x4, int64x2, s32, s64, i32, i64) - - -#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \ -inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ -{ \ - return v_##_Tpvec1(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix1##width1##m1(vreinterpret_v_##nsuffix2##width2##m1_##nsuffix1##width2##m1(v)));\ -} \ -inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ -{ \ - return v_##_Tpvec2(vreinterpret_v_##nsuffix1##width2##m1_##nsuffix2##width2##m1(vreinterpret_v_##nsuffix1##width1##m1_##nsuffix1##width2##m1(v)));\ -} - -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int16x8, u8, s16, u, i, 8, 16) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int32x4, u8, s32, u, i, 8, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, int64x2, u8, s64, u, i, 8, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int8x16, u16, s8, u, i, 16, 8) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int32x4, u16, s32, u, i, 16, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, int64x2, u16, s64, u, i, 16, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int8x16, u32, s8, u, i, 32, 8) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int16x8, u32, s16, u, i, 32, 16) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, int64x2, u32, s64, u, i, 32, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int8x16, u64, s8, u, i, 64, 8) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int16x8, u64, s16, u, i, 64, 16) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, int32x4, u64, s32, u, i, 64, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float32x4, u8, f32, u, f, 8, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float32x4, u16, f32, u, f, 16, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64x2, float32x4, u64, f32, u, f, 64, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float32x4, s8, f32, i, f, 8, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float32x4, s16, f32, i, f, 16, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64x2, float32x4, s64, f32, i, f, 64, 32) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8x16, float64x2, u8, f64, u, f, 8, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16x8, float64x2, u16, f64, u, f, 16, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32x4, float64x2, u32, f64, u, f, 32, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8x16, float64x2, s8, f64, i, f, 8, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16x8, float64x2, s16, f64, i, f, 16, 64) -OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32x4, float64x2, s32, f64, i, f, 32, 64) - -// Three times reinterpret -inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) \ -{ \ - return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v))));\ -} \ -inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) \ -{ \ - return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v))));\ -} - -////////////// Extract ////////////// - -#define 
OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, suffix, vmv, vl) \ -template \ -inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \ -} \ -template inline _Tp v_extract_n(_Tpvec v) \ -{ \ - return _Tp(vmv(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), v, i, vl))); \ -} - - -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8x16, uchar, u8, vmv_x_s_u8m1_u8, 16) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8x16, schar, i8, vmv_x_s_i8m1_i8, 16) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16x8, ushort, u16, vmv_x_s_u16m1_u16, 8) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16x8, short, i16, vmv_x_s_i16m1_i16, 8) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32x4, uint, u32, vmv_x_s_u32m1_u32, 4) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32x4, int, i32, vmv_x_s_i32m1_i32, 4) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64x2, uint64, u64, vmv_x_s_u64m1_u64, 2) -OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64x2, int64, i64, vmv_x_s_i64m1_i64, 2) - -#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, suffix, vmv, vl) \ -template \ -inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, s, vl), b, _Tpvec::nlanes - s, vl)); \ -} \ -template inline _Tp v_extract_n(_Tpvec v) \ -{ \ - return _Tp(vmv(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), v, i, vl))); \ -} - -OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32x4, float, f32, vfmv_f_s_f32m1_f32, 4) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64x2, double, f64, vfmv_f_s_f64m1_f64, 2) -#endif - -////////////// Load/Store ////////////// - -#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix, vmv) \ -inline _Tpvec v_load(const _Tp* ptr) \ -{ \ - return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \ -} \ -inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ \ - return _Tpvec(vle##width##_v_##suffix##m1(ptr, vl)); \ -} \ -inline _Tpvec v_load_low(const _Tp* ptr) \ -{ \ - _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr, hvl)); \ - return res; \ -} \ -inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ \ - vse##width##_v_##suffix##m1(ptr, a, vl); \ -} \ -inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ \ - vse##width##_v_##suffix##m1(ptr, a, vl); \ -} \ -inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ -{ \ - vse##width##_v_##suffix##m1(ptr, a, vl); \ -} \ -inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ -{ \ - vse##width##_v_##suffix##m1(ptr, a, vl); \ -} \ -inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ \ - vse##width##_v_##suffix##m1(ptr, a, hvl); \ -} \ -inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ -{ \ - vse##width##_v_##suffix##m1(ptr, vslidedown_vx_##suffix##m1(vmv(0, vl), a, hvl, vl), hvl); \ -} - -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 16, 8, u8, vmv_v_x_u8m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 16, 8, i8, vmv_v_x_i8m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 8, 16, u16, vmv_v_x_u16m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 8, 16, i16, vmv_v_x_i16m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 4, 32, u32, vmv_v_x_u32m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 4, 32, i32, vmv_v_x_i32m1) 
-OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 2, 64, u64, vmv_v_x_u64m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 2, 64, i64, vmv_v_x_i64m1) -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 4, 32, f32, vfmv_v_f_f32m1) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 2, 64, f64, vfmv_v_f_f64m1) -#endif - -inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1) -{ - schar elems[16] = - { - ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7], - ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7] - }; - return v_int8x16(vle8_v_i8m1(elems, 16)); -} -inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); } - -inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1) -{ - short elems[8] = - { - ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3] - }; - return v_int16x8(vle16_v_i16m1(elems, 8)); -} -inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); } - -inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1) -{ - int elems[4] = - { - ptr0[0], ptr0[1], ptr1[0], ptr1[1] - }; - return v_int32x4(vle32_v_i32m1(elems, 4)); -} -inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1) -{ - float elems[4] = - { - ptr0[0], ptr0[1], ptr1[0], ptr1[1] - }; - return v_float32x4(vle32_v_f32m1(elems, 4)); -} -inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); } - -inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1) -{ - int64 elems[2] = - { - ptr0[0], ptr1[0] - }; - return v_int64x2(vle64_v_i64m1(elems, 2)); -} -inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); } - -#if CV_SIMD128_64F -inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1) -{ - double elems[2] = - { - ptr0[0], ptr1[0] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} -#endif - - -////////////// Lookup table access //////////////////// - -inline v_int8x16 v_lut(const schar* tab, const int* idx) -{ - schar elems[16] = - { - tab[idx[ 0]], - tab[idx[ 1]], - tab[idx[ 2]], - tab[idx[ 3]], - tab[idx[ 4]], - tab[idx[ 5]], - tab[idx[ 6]], - tab[idx[ 7]], - tab[idx[ 8]], - tab[idx[ 9]], - tab[idx[10]], - tab[idx[11]], - tab[idx[12]], - tab[idx[13]], - tab[idx[14]], - tab[idx[15]] - }; - return v_int8x16(vle8_v_i8m1(elems, 16)); -} -inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) -{ - schar elems[16] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[1]], - tab[idx[1] + 1], - tab[idx[2]], - tab[idx[2] + 1], - tab[idx[3]], - tab[idx[3] + 1], - tab[idx[4]], - tab[idx[4] + 1], - tab[idx[5]], - tab[idx[5] + 1], - tab[idx[6]], - tab[idx[6] + 1], - tab[idx[7]], - tab[idx[7] + 1] - }; - return v_int8x16(vle8_v_i8m1(elems, 16)); -} -inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) -{ - schar elems[16] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[0] + 2], - tab[idx[0] + 3], - tab[idx[1]], - tab[idx[1] + 1], - tab[idx[1] + 2], - tab[idx[1] + 3], - tab[idx[2]], - tab[idx[2] + 1], - tab[idx[2] + 2], - tab[idx[2] + 3], - tab[idx[3]], - tab[idx[3] + 1], - tab[idx[3] + 2], - tab[idx[3] + 3] - }; - return 
v_int8x16(vle8_v_i8m1(elems, 16)); -} -inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } -inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } -inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } - -inline v_int16x8 v_lut(const short* tab, const int* idx) -{ - short elems[8] = - { - tab[idx[0]], - tab[idx[1]], - tab[idx[2]], - tab[idx[3]], - tab[idx[4]], - tab[idx[5]], - tab[idx[6]], - tab[idx[7]] - }; - return v_int16x8(vle16_v_i16m1(elems, 8)); -} -inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) -{ - short elems[8] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[1]], - tab[idx[1] + 1], - tab[idx[2]], - tab[idx[2] + 1], - tab[idx[3]], - tab[idx[3] + 1] - }; - return v_int16x8(vle16_v_i16m1(elems, 8)); -} -inline v_int16x8 v_lut_quads(const short* tab, const int* idx) -{ - short elems[8] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[0] + 2], - tab[idx[0] + 3], - tab[idx[1]], - tab[idx[1] + 1], - tab[idx[1] + 2], - tab[idx[1] + 3] - }; - return v_int16x8(vle16_v_i16m1(elems, 8)); -} -inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } -inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } -inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } - -inline v_int32x4 v_lut(const int* tab, const int* idx) -{ - int elems[4] = - { - tab[idx[0]], - tab[idx[1]], - tab[idx[2]], - tab[idx[3]] - }; - return v_int32x4(vle32_v_i32m1(elems, 4)); -} -inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) -{ - int elems[4] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[1]], - tab[idx[1] + 1] - }; - return v_int32x4(vle32_v_i32m1(elems, 4)); -} -inline v_int32x4 v_lut_quads(const int* tab, const int* idx) -{ - return v_int32x4(vle32_v_i32m1(tab + idx[0], 4)); -} - -inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } -inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } -inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } - -inline v_int64x2 v_lut(const int64_t* tab, const int* idx) -{ - int64_t elems[2] = - { - tab[idx[0]], - tab[idx[1]] - }; - return v_int64x2(vle64_v_i64m1(elems, 2)); -} -inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx) -{ - return v_int64x2(vle64_v_i64m1(tab + idx[0], 2)); -} -inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } -inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } - -inline v_float32x4 v_lut(const float* tab, const int* idx) -{ - float elems[4] = - { - tab[idx[0]], - tab[idx[1]], - tab[idx[2]], - tab[idx[3]] - }; - return v_float32x4(vle32_v_f32m1(elems, 4)); -} -inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) -{ - float elems[4] = - { - tab[idx[0]], - tab[idx[0] + 1], - tab[idx[1]], - tab[idx[1] + 1] - }; - return v_float32x4(vle32_v_f32m1(elems, 4)); -} -inline v_float32x4 v_lut_quads(const float* tab, const int* idx) -{ 
- return v_float32x4(vle32_v_f32m1(tab + idx[0], 4)); -} - -inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) -{ - int elems[4] = - { - tab[v_extract_n<0>(idxvec)], - tab[v_extract_n<1>(idxvec)], - tab[v_extract_n<2>(idxvec)], - tab[v_extract_n<3>(idxvec)] - }; - return v_int32x4(vle32_v_i32m1(elems, 4)); -} - -inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) -{ - unsigned elems[4] = - { - tab[v_extract_n<0>(idxvec)], - tab[v_extract_n<1>(idxvec)], - tab[v_extract_n<2>(idxvec)], - tab[v_extract_n<3>(idxvec)] - }; - return v_uint32x4(vle32_v_u32m1(elems, 4)); -} - -inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) -{ - float elems[4] = - { - tab[v_extract_n<0>(idxvec)], - tab[v_extract_n<1>(idxvec)], - tab[v_extract_n<2>(idxvec)], - tab[v_extract_n<3>(idxvec)] - }; - return v_float32x4(vle32_v_f32m1(elems, 4)); -} - -inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) -{ - int idx[4]; - v_store_aligned(idx, idxvec); - - x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); - y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]); -} - -#if CV_SIMD128_64F -inline v_float64x2 v_lut(const double* tab, const int* idx) -{ - double elems[2] = - { - tab[idx[0]], - tab[idx[1]] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) -{ - return v_float64x2(vle64_v_f64m1(tab + idx[0], 2)); -} - -inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) -{ - double elems[2] = - { - tab[v_extract_n<0>(idxvec)], - tab[v_extract_n<1>(idxvec)] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) -{ - int idx[4] = {0}; - v_store_aligned(idx, idxvec); - - x = v_float64x2(tab[idx[0]], tab[idx[1]]); - y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); -} -#endif - -////////////// Pack boolean //////////////////// - -inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) -{ - ushort ptr[16] = {0}; - v_store(ptr, a); - v_store(ptr + 8, b); - return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr, 16), 0, 16)); -} - -inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, - const v_uint32x4& c, const v_uint32x4& d) -{ - unsigned ptr[16] = {0}; - v_store(ptr, a); - v_store(ptr + 4, b); - v_store(ptr + 8, c); - v_store(ptr + 12, d); - return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr, 16), 0, 16), 0, 16)); -} - -inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, - const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, - const v_uint64x2& g, const v_uint64x2& h) -{ - uint64 ptr[16] = {0}; - v_store(ptr, a); - v_store(ptr + 2, b); - v_store(ptr + 4, c); - v_store(ptr + 6, d); - v_store(ptr + 8, e); - v_store(ptr + 10, f); - v_store(ptr + 12, g); - v_store(ptr + 14, h); - return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr, 16), 0, 16), 0, 16), 0, 16)); -} - -////////////// Arithmetics ////////////// -#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, vl) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(intrin(a, b, vl)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a = _Tpvec(intrin(a, b, vl)); \ - return a; \ -} - -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 16) 
-OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 2) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 2) -#endif - - -////////////// Bitwise logic ////////////// - -#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, vl) \ -OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, vl) \ -OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, vl) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ -{ \ - return _Tpvec(vnot_v_##suffix##m1(a, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 16) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 16) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 8) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 8) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 4) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 4) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 2) -OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 2) - -#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ -{ \ - return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b), 4))); \ - return a; \ -} - -OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1) 
-OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1) -OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1) - -inline v_float32x4 operator ~ (const v_float32x4& a) -{ - return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a), 4))); -} - -#if CV_SIMD128_64F -#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ -{ \ - return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ -{ \ - a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b), 2))); \ - return a; \ -} - -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1) -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1) -OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1) - -inline v_float64x2 operator ~ (const v_float64x2& a) -{ - return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a), 2))); -} -#endif - -////////////// Bitwise shifts ////////////// - -#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, vl) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ -{ \ - return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ -{ \ - return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -template inline _Tpvec v_shl(const _Tpvec& a) \ -{ \ - return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -template inline _Tpvec v_shr(const _Tpvec& a) \ -{ \ - return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} - -#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, vl) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ -{ \ - return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ -{ \ - return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -template inline _Tpvec v_shl(const _Tpvec& a) \ -{ \ - return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} \ -template inline _Tpvec v_shr(const _Tpvec& a) \ -{ \ - return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n), vl)); \ -} - -OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 16) -OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 8) -OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 4) -OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 2) -OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 16) -OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 8) -OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 4) -OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 2) - - -////////////// Comparison ////////////// - -#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \ -inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ -{ \ - uint64_t ones = -1; \ - return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b, vl), vmv_v_x_##suffix##m1(0, vl), ones, vl)); \ -} - -#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, vl) \ -inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \ -{ \ - union { uint64 u; double d; } ones; ones.u = -1; \ - return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b, vl), vfmv_v_f_##suffix##m1(0, vl), ones.d, vl)); \ -} - -#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \ 
-OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, vl) - -#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, vl) - -#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, vl) \ -OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, vl) - - -OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8, 16) -OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16, 8) -OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32, 4) -OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64, 2) -OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8, 16) -OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16, 8) -OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32, 4) -OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64, 2) -OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32, 4) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64, 2) -#endif - -inline v_float32x4 v_not_nan(const v_float32x4& a) -{ return a == a; } - -#if CV_SIMD128_64F -inline v_float64x2 v_not_nan(const v_float64x2& a) -{ return a == a; } -#endif - -////////////// Min/Max ////////////// - -#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \ -inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(intrin(a, b, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4) 
-OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 2) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2) -#endif - -////////////// Arithmetics wrap ////////////// - -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8) -OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) - -////////////// Reduce ////////////// - -#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \ -inline scalartype v_reduce_sum(const _Tpvec& a) \ -{ \ - _nwTpvec zero = vmv_v_x_##wsuffix##m1(0, vl); \ - _nwTpvec res = vmv_v_x_##wsuffix##m1(0, vl); \ - res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \ - return (scalartype)(_wTpvec(res).get0()); \ -} - -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 8, wredsumu) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 8, wredsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 4, wredsumu) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 4, wredsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 2, redsum) -OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 2, redsum) - -#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, vl, red) \ -inline scalartype v_reduce_sum(const _Tpvec& a) \ -{ \ - _nwTpvec zero = vfmv_v_f_##wsuffix##m1(0, vl); \ - _nwTpvec res = vfmv_v_f_##wsuffix##m1(0, vl); \ - res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero, vl); \ - return (scalartype)(_wTpvec(res).get0()); \ -} - -// vfredsum for float has renamed to fredosum, also updated in GNU. 
-OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 4, fredosum) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 2, fredosum) -#endif - - -#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, vl, red) \ -inline scalartype v_reduce_##func(const _Tpvec& a) \ -{ \ - _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a, vl)); \ - return scalartype(res.get0()); \ -} - -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 16, redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 16, redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 8, redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 8, redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 4, redminu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 4, redmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 4, fredmin) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 16, redmaxu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 16, redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 8, redmaxu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 8, redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 4, redmaxu) -OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 4, redmax) -OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 4, fredmax) - - -inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, - const v_float32x4& c, const v_float32x4& d) -{ - float elems[4] = - { - v_reduce_sum(a), - v_reduce_sum(b), - v_reduce_sum(c), - v_reduce_sum(d) - }; - return v_float32x4(vle32_v_f32m1(elems, 4)); -} - -////////////// Square-Root ////////////// - -inline v_float32x4 v_sqrt(const v_float32x4& x) -{ - return v_float32x4(vfsqrt_v_f32m1(x, 4)); -} - -inline v_float32x4 v_invsqrt(const v_float32x4& x) -{ - v_float32x4 one = v_setall_f32(1.0f); - return one / v_sqrt(x); -} - -#if CV_SIMD128_64F -inline v_float64x2 v_sqrt(const v_float64x2& x) -{ - return v_float64x2(vfsqrt_v_f64m1(x, 4)); -} - -inline v_float64x2 v_invsqrt(const v_float64x2& x) -{ - v_float64x2 one = v_setall_f64(1.0f); - return one / v_sqrt(x); -} -#endif - -inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) -{ - v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4)); - return v_sqrt(x); -} - -inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) -{ - return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a, 4), b, b, 4)); -} - -#if CV_SIMD128_64F -inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) -{ - v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2)); - return v_sqrt(x); -} - -inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) -{ - return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a, 2), b, b, 2)); -} -#endif - -////////////// Multiply-Add ////////////// - -inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) -{ - return v_float32x4(vfmacc_vv_f32m1(c, a, b, 4)); -} -inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ - return v_int32x4(vmacc_vv_i32m1(c, a, b, 4)); -} - -inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) -{ - return v_fma(a, b, c); -} - -inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ - return v_fma(a, b, c); -} - -#if 
CV_SIMD128_64F -inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) -{ - return v_float64x2(vfmacc_vv_f64m1(c, a, b, 2)); -} - -inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) -{ - return v_fma(a, b, c); -} -#endif - -////////////// Check all/any ////////////// - -// use overloaded vcpop in clang, no casting like (vuint64m1_t) is needed. -#ifndef __clang__ -#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, vl) \ -inline bool v_check_all(const _Tpvec& a) \ -{ \ - v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a, vl), shift, vl)); \ - return (v.val[0] | v.val[1]) == 0; \ -} \ -inline bool v_check_any(const _Tpvec& a) \ -{ \ - v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift, vl)); \ - return (v.val[0] | v.val[1]) != 0; \ -} - -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 16) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 8) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 4) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 2) - - -inline bool v_check_all(const v_int8x16& a) -{ return v_check_all(v_reinterpret_as_u8(a)); } -inline bool v_check_any(const v_int8x16& a) -{ return v_check_any(v_reinterpret_as_u8(a)); } - -inline bool v_check_all(const v_int16x8& a) -{ return v_check_all(v_reinterpret_as_u16(a)); } -inline bool v_check_any(const v_int16x8& a) -{ return v_check_any(v_reinterpret_as_u16(a)); } - -inline bool v_check_all(const v_int32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } -inline bool v_check_any(const v_int32x4& a) -{ return v_check_any(v_reinterpret_as_u32(a)); } - -inline bool v_check_all(const v_float32x4& a) -{ return v_check_all(v_reinterpret_as_u32(a)); } -inline bool v_check_any(const v_float32x4& a) -{ return v_check_any(v_reinterpret_as_u32(a)); } - -inline bool v_check_all(const v_int64x2& a) -{ return v_check_all(v_reinterpret_as_u64(a)); } -inline bool v_check_any(const v_int64x2& a) -{ return v_check_any(v_reinterpret_as_u64(a)); } - -#if CV_SIMD128_64F -inline bool v_check_all(const v_float64x2& a) -{ return v_check_all(v_reinterpret_as_u64(a)); } -inline bool v_check_any(const v_float64x2& a) -{ return v_check_any(v_reinterpret_as_u64(a)); } -#endif -#else -#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \ -inline bool v_check_all(const _Tpvec& a) \ -{ \ - return vcpop(vmslt(a, 0, vl), vl) == vl; \ -} \ -inline bool v_check_any(const _Tpvec& a) \ -{ \ - return vcpop(vmslt(a, 0, vl), vl) != 0; \ -} - -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8x16, 16) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16x8, 8) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32x4, 4) -OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64x2, 2) - - -inline bool v_check_all(const v_uint8x16& a) -{ return v_check_all(v_reinterpret_as_s8(a)); } -inline bool v_check_any(const v_uint8x16& a) -{ return v_check_any(v_reinterpret_as_s8(a)); } - -inline bool v_check_all(const v_uint16x8& a) -{ return v_check_all(v_reinterpret_as_s16(a)); } -inline bool v_check_any(const v_uint16x8& a) -{ return v_check_any(v_reinterpret_as_s16(a)); } - -inline bool v_check_all(const v_uint32x4& a) -{ return v_check_all(v_reinterpret_as_s32(a)); } -inline bool v_check_any(const v_uint32x4& a) -{ return v_check_any(v_reinterpret_as_s32(a)); } - -inline bool v_check_all(const v_float32x4& a) -{ return v_check_all(v_reinterpret_as_s32(a)); } -inline bool v_check_any(const v_float32x4& a) -{ return v_check_any(v_reinterpret_as_s32(a)); } - -inline bool 
v_check_all(const v_uint64x2& a) -{ return v_check_all(v_reinterpret_as_s64(a)); } -inline bool v_check_any(const v_uint64x2& a) -{ return v_check_any(v_reinterpret_as_s64(a)); } - -#if CV_SIMD128_64F -inline bool v_check_all(const v_float64x2& a) -{ return v_check_all(v_reinterpret_as_s64(a)); } -inline bool v_check_any(const v_float64x2& a) -{ return v_check_any(v_reinterpret_as_s64(a)); } -#endif -#endif -////////////// abs ////////////// - -#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \ -inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return v_max(a, b) - v_min(a, b); \ -} - -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff) -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff) -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff) -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff) -#endif -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs) -OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs) - -// use reinterpret instead of c-style casting. -#ifndef __clang__ -#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, vl) \ -inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b), vl), 0, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16) -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 8) -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 4) -#else -#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width, vl) \ -inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _rTpvec(rshr(vreinterpret_u##width##m2(sub(v_max(a, b), v_min(a, b), vl)), 0, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 16, 16) -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 32, 8) -OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 64, 4) -#endif -#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \ -inline _Tprvec v_abs(const _Tpvec& a) \ -{ \ - return v_absdiff(a, v_setzero_##suffix()); \ -} - -OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8) -OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16) -OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32) -OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64) -#endif - - -#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \ -inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return v_reduce_sum(v_absdiff(a, b)); \ -} - -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned) -OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float) - -////////////// Select ////////////// - -#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, vl) \ -inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(merge(ne(mask, 0, vl), b, a, vl)); \ -} - -OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 16) 
-OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 16) -OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 8) -OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 8) -OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 4) -OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 4) -OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 4) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 2) -#endif - -////////////// Rotate shift ////////////// - -#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \ -template inline _Tpvec v_rotate_right(const _Tpvec& a) \ -{ \ - return _Tpvec(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \ -} \ -template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl)); \ -} \ -template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ -{ return a; } \ -template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \ -} \ -template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vmv_v_x_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \ -} \ -template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ -{ CV_UNUSED(b); return a; } - -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8x16, u8, 16) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8x16, i8, 16) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16x8, u16, 8) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16x8, i16, 8) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32x4, u32, 4) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32x4, i32, 4) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64x2, u64, 2) -OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64x2, i64, 2) - -#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \ -template inline _Tpvec v_rotate_right(const _Tpvec& a) \ -{ \ - return _Tpvec(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \ -} \ -template inline _Tpvec v_rotate_left(const _Tpvec& a) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl)); \ -} \ -template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ -{ return a; } \ -template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), a, n, vl), b, _Tpvec::nlanes - n, vl)); \ -} \ -template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ -{ \ - return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vfmv_v_f_##suffix##m1(0, vl), b, _Tpvec::nlanes - n, vl), a, n, vl)); \ -} \ -template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ -{ CV_UNUSED(b); return a; } - -OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32x4, f32, 4) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64x2, f64, 2) -#endif - -////////////// Convert to float ////////////// - -inline v_float32x4 v_cvt_f32(const v_int32x4& a) -{ - return v_float32x4(vfcvt_f_x_v_f32m1(a, 4)); -} - -#if CV_SIMD128_64F -#ifndef __clang__ -inline v_float32x4 v_cvt_f32(const v_float64x2& a) -{ - double arr[4] = {a.val[0], a.val[1], 0, 0}; - vfloat64m2_t tmp = 
vle64_v_f64m2(arr, 4); - return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4)); -} - -inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) -{ - double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]}; - vfloat64m2_t tmp = vle64_v_f64m2(arr, 4); - return v_float32x4(vfncvt_f_f_w_f32m1(tmp, 4)); -} -#else -inline v_float32x4 v_cvt_f32(const v_float64x2& a) -{ - vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4); - return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(zero, 0, a), 4)); -} -inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) -{ - vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a); - return v_float32x4(vfncvt_f_f_w_f32m1(vset_v_f64m1_f64m2(dst, 1, b), 4)); -} -#endif - -inline v_float64x2 v_cvt_f64(const v_int32x4& a) -{ - double ptr[4] = {0}; - vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4); - double elems[2] = - { - ptr[0], ptr[1] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) -{ - double ptr[4] = {0}; - vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a, 4), 4); - double elems[2] = - { - ptr[2], ptr[3] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline v_float64x2 v_cvt_f64(const v_float32x4& a) -{ - double ptr[4] = {0}; - vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4); - double elems[2] = - { - ptr[0], ptr[1] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) -{ - double ptr[4] = {0}; - vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a, 4), 4); - double elems[2] = - { - ptr[2], ptr[3] - }; - return v_float64x2(vle64_v_f64m1(elems, 2)); -} - -inline v_float64x2 v_cvt_f64(const v_int64x2& a) -{ - return v_float64x2(vfcvt_f_x_v_f64m1(a, 2)); -} -#endif - -////////////// Broadcast ////////////// - -#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \ -template inline _Tpvec v_broadcast_element(_Tpvec v) \ -{ \ - return v_setall_##suffix(v_extract_n(v)); \ -} - -OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64) -OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64) -#endif - -////////////// Transpose4x4 ////////////// - -#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \ -inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \ - const v_##_Tpvec& a2, const v_##_Tpvec& a3, \ - v_##_Tpvec& b0, v_##_Tpvec& b1, \ - v_##_Tpvec& b2, v_##_Tpvec& b3) \ -{ \ - _Tp elems0[4] = \ - { \ - v_extract_n<0>(a0), \ - v_extract_n<0>(a1), \ - v_extract_n<0>(a2), \ - v_extract_n<0>(a3) \ - }; \ - b0 = v_load(elems0); \ - _Tp elems1[4] = \ - { \ - v_extract_n<1>(a0), \ - v_extract_n<1>(a1), \ - v_extract_n<1>(a2), \ - v_extract_n<1>(a3) \ - }; \ - b1 = v_load(elems1); \ - _Tp elems2[4] = \ - { \ - v_extract_n<2>(a0), \ - v_extract_n<2>(a1), \ - v_extract_n<2>(a2), \ - v_extract_n<2>(a3) \ - }; \ - b2 = v_load(elems2); \ - _Tp elems3[4] = \ - { \ - v_extract_n<3>(a0), \ - v_extract_n<3>(a1), \ - v_extract_n<3>(a2), \ - v_extract_n<3>(a3) \ - }; \ - b3 = v_load(elems3); \ -} - -OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32) -OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32) 
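
The v_transpose4x4 helper removed above gathers lane i of each input row into output row i, i.e. a plain 4x4 matrix transpose done through per-lane extraction. A minimal usage sketch, not part of the patch, assuming the public wrapper header opencv2/core/hal/intrin.hpp that normally exposes these universal intrinsics:

#include "opencv2/core/hal/intrin.hpp"   // assumed public include path for the universal intrinsics

void transpose4x4_demo(const float src[16], float dst[16])
{
    using namespace cv;
    // load the four rows of a row-major 4x4 matrix
    v_float32x4 r0 = v_load(src +  0), r1 = v_load(src +  4);
    v_float32x4 r2 = v_load(src +  8), r3 = v_load(src + 12);
    v_float32x4 c0, c1, c2, c3;
    v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);   // c_j[i] == r_i[j]
    v_store(dst +  0, c0); v_store(dst +  4, c1);
    v_store(dst +  8, c2); v_store(dst + 12, c3);
}
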
- -////////////// Reverse ////////////// - -#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, suffix) \ -inline _Tpvec v_reverse(const _Tpvec& a) \ -{ \ - _Tp ptr[_Tpvec::nlanes] = {0}; \ - _Tp ptra[_Tpvec::nlanes] = {0}; \ - v_store(ptra, a); \ - for (int i = 0; i < _Tpvec::nlanes; i++) \ - { \ - ptr[i] = ptra[_Tpvec::nlanes-i-1]; \ - } \ - return v_load(ptr); \ -} - -OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, i8) -OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, i16) -OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, i32) -OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, i64) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, f64) -#endif - -//////////// Value reordering //////////// - -#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt, vl) \ -inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ -{ \ - _Tp lptr[_Tpvec::nlanes/2] = {0}; \ - _Tp hptr[_Tpvec::nlanes/2] = {0}; \ - v_store_low(lptr, a); \ - v_store_high(hptr, a); \ - b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \ - b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \ -} \ -inline _Tpwvec v_expand_low(const _Tpvec& a) \ -{ \ - _Tp lptr[_Tpvec::nlanes/2] = {0}; \ - v_store_low(lptr, a); \ - return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr, vl), vl)); \ -} \ -inline _Tpwvec v_expand_high(const _Tpvec& a) \ -{ \ - _Tp hptr[_Tpvec::nlanes/2] = {0}; \ - v_store_high(hptr, a); \ - return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr, vl), vl)); \ -} \ -inline _Tpwvec v_load_expand(const _Tp* ptr) \ -{ \ - return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr, vl), vl)); \ -} - -OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1, 8) -OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1, 8) -OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1, 4) -OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1, 4) -OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1, 2) -OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1, 2) - -inline v_uint32x4 v_load_expand_q(const uchar* ptr) -{ - return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr, 4), 4), 4)); -} - -inline v_int32x4 v_load_expand_q(const schar* ptr) -{ - return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr, 4), 4), 4)); -} - - -#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, shr, hvl, vl) \ -inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, b); \ - return _Tpvec(shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl)); \ -} \ -inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \ - vse##hwidth##_v_##hsuffix##m1(ptr, shr(vle##width##_v_##suffix##m2(arr, vl), 0, vl), hvl); \ -} \ -template inline \ -_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + 
_wTpvec::nlanes, b); \ - return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)); \ -} \ -template inline \ -void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \ - vse##hwidth##_v_##hsuffix##m1(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr, vl), n, vl)), hvl); \ -} - -OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 8, 16, u8, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1, 8, 16) -OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 8, 16, i8, i16, vnclip_wx_i8m1, vnclip_wx_i8m1, 8, 16) -OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 16, 32, u16, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1, 4, 8) -OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 16, 32, i16, i32, vnclip_wx_i16m1, vnclip_wx_i16m1, 4, 8) -OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 32, 64, u32, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1, 2, 4) -OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 32, 64, i32, i64, vnclip_wx_i32m1, vnsra_wx_i32m1, 2, 4) - - -#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, rshr, cast, hvl, vl) \ -inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, b); \ - return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl)); \ -} \ -inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \ - vse##hwidth##_v_##hsuffix##m1(ptr, rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), 0, vl), hvl); \ -} \ -template inline \ -_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, b); \ - return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl)); \ -} \ -template inline \ -void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \ -{ \ - _wTp arr[_Tpvec::nlanes] = {0}; \ - v_store(arr, a); \ - v_store(arr + _wTpvec::nlanes, _wTpvec(vmv_v_x_##suffix##m1(0, hvl))); \ - v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr, vl), 0, vl)), n, vl))); \ -} - -OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 8, 16, u8, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2, 8, 16) -OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 16, 32, u16, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2, 4, 8) - - -#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, suffix) \ -inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ -{ \ - _Tp ptra0[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptra1[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb0[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb1[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptra0, a0); \ - v_store(ptra1, a1); \ - int i; \ - for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \ - { \ - ptrb0[i*2] = ptra0[i]; \ - ptrb0[i*2+1] = ptra1[i]; \ - } \ - for( ; i < v_##_Tpvec::nlanes; i++ ) \ - { \ - ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \ - ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \ - } \ - b0 = v_load(ptrb0); \ - b1 = v_load(ptrb1); \ -} \ -inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ -{ \ - _Tp 
ptra[v_##_Tpvec::nlanes/2] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \ - v_store_low(ptra, a); \ - v_store_low(ptrb, b); \ - return v_load_halves(ptra, ptrb); \ -} \ -inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ -{ \ - _Tp ptra[v_##_Tpvec::nlanes/2] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes/2] = {0}; \ - v_store_high(ptra, a); \ - v_store_high(ptrb, b); \ - return v_load_halves(ptra, ptrb); \ -} \ -inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ -{ \ - c = v_combine_low(a, b); \ - d = v_combine_high(a, b); \ -} - -OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, i8) -OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, i16) -OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, i32) -OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, f32) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, f64) -#endif - - -#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp) \ -inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ -{ \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - int i, i2; \ - for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \ - { \ - ptra[i] = ptr[i2]; \ - ptrb[i] = ptr[i2+1]; \ - } \ - a = v_load(ptra); \ - b = v_load(ptrb); \ -} \ -inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ -{ \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \ - int i, i3; \ - for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \ - { \ - ptra[i] = ptr[i3]; \ - ptrb[i] = ptr[i3+1]; \ - ptrc[i] = ptr[i3+2]; \ - } \ - a = v_load(ptra); \ - b = v_load(ptrb); \ - c = v_load(ptrc); \ -} \ -inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ - v_##_Tpvec& c, v_##_Tpvec& d) \ -{ \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \ - int i, i4; \ - for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \ - { \ - ptra[i] = ptr[i4]; \ - ptrb[i] = ptr[i4+1]; \ - ptrc[i] = ptr[i4+2]; \ - ptrd[i] = ptr[i4+3]; \ - } \ - a = v_load(ptra); \ - b = v_load(ptrb); \ - c = v_load(ptrc); \ - d = v_load(ptrd); \ -} \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - int i, i2; \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptra, a); \ - v_store(ptrb, b); \ - for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \ - { \ - ptr[i2] = ptra[i]; \ - ptr[i2+1] = ptrb[i]; \ - } \ -} \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - int i, i3; \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptra, a); \ - v_store(ptrb, b); \ - v_store(ptrc, c); \ - for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \ - { \ - ptr[i3] = ptra[i]; \ - ptr[i3+1] = ptrb[i]; \ - ptr[i3+2] = ptrc[i]; \ - } \ -} \ -inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ - const v_##_Tpvec& c, const v_##_Tpvec& d, \ - hal::StoreMode 
/*mode*/=hal::STORE_UNALIGNED ) \ -{ \ - int i, i4; \ - _Tp ptra[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrb[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrc[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrd[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptra, a); \ - v_store(ptrb, b); \ - v_store(ptrc, c); \ - v_store(ptrd, d); \ - for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \ - { \ - ptr[i4] = ptra[i]; \ - ptr[i4+1] = ptrb[i]; \ - ptr[i4+2] = ptrc[i]; \ - ptr[i4+3] = ptrd[i]; \ - } \ -} \ -inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \ -{ \ - _Tp ptr[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptrvec, vec); \ - for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \ - { \ - ptr[4*i ] = ptrvec[4*i ]; \ - ptr[4*i+1] = ptrvec[4*i+2]; \ - ptr[4*i+2] = ptrvec[4*i+1]; \ - ptr[4*i+3] = ptrvec[4*i+3]; \ - } \ - return v_load(ptr); \ -} \ -inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \ -{ \ - _Tp ptr[v_##_Tpvec::nlanes] = {0}; \ - _Tp ptrvec[v_##_Tpvec::nlanes] = {0}; \ - v_store(ptrvec, vec); \ - for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \ - { \ - ptr[8*i ] = ptrvec[4*i ]; \ - ptr[8*i+1] = ptrvec[4*i+4]; \ - ptr[8*i+2] = ptrvec[4*i+1]; \ - ptr[8*i+3] = ptrvec[4*i+5]; \ - ptr[8*i+4] = ptrvec[4*i+2]; \ - ptr[8*i+5] = ptrvec[4*i+6]; \ - ptr[8*i+6] = ptrvec[4*i+3]; \ - ptr[8*i+7] = ptrvec[4*i+7]; \ - } \ - return v_load(ptr); \ -} - -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64) -OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double) -#endif - -//////////// PopCount //////////// - -static const unsigned char popCountTable[] = -{ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, -}; - -#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \ -inline _rTpvec v_popcount(const _Tpvec& a) \ -{ \ - uchar ptra[16] = {0}; \ - v_store(ptra, v_reinterpret_as_u8(a)); \ - _rTp ptr[_Tpvec::nlanes] = {0}; \ - v_store(ptr, v_setzero_##suffix()); \ - for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \ - ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \ - return v_load(ptr); \ -} - -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16) 
-OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64) -OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64) - -//////////// SignMask //////////// - -#ifndef __clang__ -#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, vl, shift) \ -inline int v_signmask(const _Tpvec& a) \ -{ \ - int mask = 0; \ - _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift, vl)); \ - for( int i = 0; i < _Tpvec::nlanes; i++ ) \ - mask |= (int)(tmp.val[i]) << i; \ - return mask; \ -} - -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 16, 7) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 8, 15) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 4, 31) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 2, 63) - -inline int v_signmask(const v_int8x16& a) -{ return v_signmask(v_reinterpret_as_u8(a)); } -inline int v_signmask(const v_int16x8& a) -{ return v_signmask(v_reinterpret_as_u16(a)); } -inline int v_signmask(const v_int32x4& a) -{ return v_signmask(v_reinterpret_as_u32(a)); } -inline int v_signmask(const v_float32x4& a) -{ return v_signmask(v_reinterpret_as_u32(a)); } -inline int v_signmask(const v_int64x2& a) -{ return v_signmask(v_reinterpret_as_u64(a)); } -#if CV_SIMD128_64F -inline int v_signmask(const v_float64x2& a) -{ return v_signmask(v_reinterpret_as_u64(a)); } -#endif - -#else -#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, width, vl) \ -inline int v_signmask(const _Tpvec& a) \ -{ \ - uint8_t ans[16] = {0};\ - vsm(ans, vmslt(a, 0, vl), vl);\ - return reinterpret_cast(ans)[0];\ -} - -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8x16, 8, 16) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16x8, 16, 8) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32x4, 32, 4) -OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64x2, 64, 2) - -inline int v_signmask(const v_uint8x16& a) -{ return v_signmask(v_reinterpret_as_s8(a)); } -inline int v_signmask(const v_uint16x8& a) -{ return v_signmask(v_reinterpret_as_s16(a)); } -inline int v_signmask(const v_uint32x4& a) -{ return v_signmask(v_reinterpret_as_s32(a)); } -inline int v_signmask(const v_float32x4& a) -{ return v_signmask(v_reinterpret_as_s32(a)); } -inline int v_signmask(const v_uint64x2& a) -{ return v_signmask(v_reinterpret_as_s64(a)); } -#if CV_SIMD128_64F -inline int v_signmask(const v_float64x2& a) -{ return v_signmask(v_reinterpret_as_s64(a)); } -#endif - -#endif - -//////////// Scan forward //////////// - -#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \ -inline int v_scan_forward(const _Tpvec& a) \ -{ \ - _Tp ptr[_Tpvec::nlanes] = {0}; \ - v_store(ptr, v_reinterpret_as_##suffix(a)); \ - for (int i = 0; i < _Tpvec::nlanes; i++) \ - if(int(ptr[i]) < 0) \ - return i; \ - return 0; \ -} - -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64) -#if CV_SIMD128_64F -OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64) -#endif - 
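// Illustrative sketch, not taken from the patch above: the deleted v_popcount emulation
// stores the vector lanes to a byte buffer and sums entries of a 256-entry lookup table
// (ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]). The same idea in portable scalar C++;
// the names popcount_u8_table and popcount_bytes are assumed here and are not OpenCV APIs.
#include <array>
#include <cstdint>
#include <cstdio>

// Build the 256-entry per-byte popcount table once, mirroring popCountTable above.
static const std::array<std::uint8_t, 256> popcount_u8_table = []() -> std::array<std::uint8_t, 256> {
    std::array<std::uint8_t, 256> t{};
    for (int v = 0; v < 256; ++v)
    {
        int c = 0;
        for (int x = v; x != 0; x >>= 1) c += x & 1;
        t[v] = static_cast<std::uint8_t>(c);
    }
    return t;
}();

// Sum of set bits across a 16-byte "vector" stored to memory, mirroring the
// store-then-loop fallback used by the RVV implementation.
static inline unsigned popcount_bytes(const std::uint8_t (&lanes)[16])
{
    unsigned total = 0;
    for (std::uint8_t b : lanes)
        total += popcount_u8_table[b];
    return total;
}

int main()
{
    const std::uint8_t lanes[16] = {0xFF, 0x0F, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80};
    std::printf("popcount = %u\n", popcount_bytes(lanes)); // 8 + 4 + 1 + 1 + 2 + 1 = 17
    return 0;
}
// The deleted macro applies this per element, dividing the running byte index by
// sizeof(_Tp) so the per-byte counts accumulate into the wider destination lanes.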
-//////////// Pack triplets //////////// - -// use reinterpret instead of c-style casting. -#ifndef __clang__ -inline v_int8x16 v_pack_triplets(const v_int8x16& vec) -{ - uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)vint8m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16)); -} -inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) -{ - return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); -} - -inline v_int16x8 v_pack_triplets(const v_int16x8& vec) -{ - uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vint16m1_t(vec), (vuint8m1_t)vle64_v_u64m1(ptr, 2), 16)); -} -inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) -{ - return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); -} - -inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } -inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } -inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } - -#else - -inline v_int8x16 v_pack_triplets(const v_int8x16& vec) -{ - uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int8x16(vreinterpret_i8m1(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16))); -} -inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) -{ - return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); -} - -inline v_int16x8 v_pack_triplets(const v_int16x8& vec) -{ - uint64 ptr[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int16x8(v_reinterpret_as_s16(v_uint8x16(vrgather_vv_u8m1(v_reinterpret_as_u8(vec), vreinterpret_u8m1(vle64_v_u64m1(ptr, 2)), 16)))); -} -inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) -{ - return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); -} - -inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } -inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } -inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } - -#endif - -////// FP16 support /////// - -#if CV_FP16 -inline v_float32x4 v_load_expand(const float16_t* ptr) -{ - return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr, 4), 4)); -} - -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) -{ - vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v, 4), 4); -} -#else -inline v_float32x4 v_load_expand(const float16_t* ptr) -{ - const int N = 4; - float buf[N]; - for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i]; - return v_load(buf); -} - -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) -{ - const int N = 4; - float buf[N]; - v_store(buf, v); - for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]); -} -#endif - -////////////// Rounding ////////////// - -inline v_int32x4 v_round(const v_float32x4& a) -{ - return v_int32x4(vfcvt_x_f_v_i32m1(a, 4)); -} - -inline v_int32x4 v_floor(const v_float32x4& a) -{ - v_float32x4 ZP5 = v_setall_f32(0.5f); - v_float32x4 t = a - ZP5; - return v_int32x4(vfcvt_x_f_v_i32m1(t, 4)); -} - -inline v_int32x4 v_ceil(const v_float32x4& a) -{ - v_float32x4 ZP5 = v_setall_f32(0.5f); - v_float32x4 t = a + ZP5; - return v_int32x4(vfcvt_x_f_v_i32m1(t, 4)); -} - -inline v_int32x4 v_trunc(const v_float32x4& a) -{ - return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a, 4)); -} -#if CV_SIMD128_64F -#ifndef __clang__ -inline v_int32x4 v_round(const v_float64x2& a) -{ - double arr[4] = {a.val[0], a.val[1], 0, 0}; - vfloat64m2_t tmp = 
vle64_v_f64m2(arr, 4); - return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4)); -} - -inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) -{ - double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]}; - vfloat64m2_t tmp = vle64_v_f64m2(arr, 4); - return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4)); -} - -inline v_int32x4 v_floor(const v_float64x2& a) -{ - double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0}; - vfloat64m2_t tmp = vle64_v_f64m2(arr, 4); - return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4)); -} - -inline v_int32x4 v_ceil(const v_float64x2& a) -{ - double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0}; - vfloat64m2_t tmp = vle64_v_f64m2(arr, 4); - return v_int32x4(vfncvt_x_f_w_i32m1(tmp, 4)); -} - -inline v_int32x4 v_trunc(const v_float64x2& a) -{ - double arr[4] = {a.val[0], a.val[1], 0, 0}; - vfloat64m2_t tmp = vle64_v_f64m2(arr, 4); - return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp, 4)); -} - -#else -inline v_int32x4 v_round(const v_float64x2& a) -{ - vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4); - return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4)); -} - -inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) -{ - vfloat64m2_t dst = vlmul_ext_v_f64m1_f64m2(a); - return v_int32x4(vfncvt_x_f_w_i32m1(vset_v_f64m1_f64m2(dst, 1, b), 4)); -} - -inline v_int32x4 v_floor(const v_float64x2& a) -{ - vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4); - dst = vset_v_f64m1_f64m2(dst, 0, a); - dst = vfsub_vf_f64m2(dst, 0.5, 2); - return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4)); -} - -inline v_int32x4 v_ceil(const v_float64x2& a) -{ - vfloat64m2_t dst = vfmv_v_f_f64m2(0, 4); - dst = vset_v_f64m1_f64m2(dst, 0, a); - dst = vfadd_vf_f64m2(dst, 0.5, 2); - return v_int32x4(vfncvt_x_f_w_i32m1(dst, 4)); -} - -inline v_int32x4 v_trunc(const v_float64x2& a) -{ - vfloat64m2_t zero = vfmv_v_f_f64m2(0, 4); - return v_int32x4(vfncvt_rtz_x_f_w_i32m1(vset_v_f64m1_f64m2(zero, 0, a), 4)); -} -#endif -#endif - - -//////// Dot Product //////// - -// 16 >> 32 -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) -{ - int ptr[8] = {0}; - v_int32x4 t1, t2; - vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); - v_load_deinterleave(ptr, t1, t2); - return t1 + t2; -} -inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ - int ptr[8] = {0}; - v_int32x4 t1, t2; - vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); - v_load_deinterleave(ptr, t1, t2); - return t1 + t2 + c; -} - -// 32 >> 64 -inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) -{ - int64 ptr[4] = {0}; - v_int64x2 t1, t2; - vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); - v_load_deinterleave(ptr, t1, t2); - return t1 + t2; -} -inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ - int64 ptr[4] = {0}; - v_int64x2 t1, t2; - vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); - v_load_deinterleave(ptr, t1, t2); - return t1 + t2 + c; -} - -// 8 >> 32 -inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) -{ - unsigned ptr[16] = {0}; - v_uint32x4 t1, t2, t3, t4; - vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4; -} -inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, - const v_uint32x4& c) -{ - unsigned ptr[16] = {0}; - v_uint32x4 t1, t2, t3, t4; - vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4 + c; -} - -inline 
v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) -{ - int ptr[16] = {0}; - v_int32x4 t1, t2, t3, t4; - vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4; -} -inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, - const v_int32x4& c) -{ - int ptr[16] = {0}; - v_int32x4 t1, t2, t3, t4; - vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4 + c; -} - -// 16 >> 64 -inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) -{ - uint64 ptr[8] = {0}; - v_uint64x2 t1, t2, t3, t4; - vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4; -} -inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ - uint64 ptr[8] = {0}; - v_uint64x2 t1, t2, t3, t4; - vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4 + c; -} - -inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) -{ - int64 ptr[8] = {0}; - v_int64x2 t1, t2, t3, t4; - vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4; -} -inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, - const v_int64x2& c) -{ - int64 ptr[8] = {0}; - v_int64x2 t1, t2, t3, t4; - vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); - v_load_deinterleave(ptr, t1, t2, t3, t4); - return t1 + t2 + t3 + t4 + c; -} - -// 32 >> 64f -#if CV_SIMD128_64F -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) -{ return v_cvt_f64(v_dotprod(a, b)); } -inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, - const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } -#endif - -//////// Fast Dot Product //////// - -// 16 >> 32 -inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) -{ - int ptr[8] = {0}; - vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); - v_int32x4 t1 = v_load(ptr); - v_int32x4 t2 = v_load(ptr+4); - return t1 + t2; -} -inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ - int ptr[8] = {0}; - vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b, 8), 8); - v_int32x4 t1 = v_load(ptr); - v_int32x4 t2 = v_load(ptr+4); - return t1 + t2 + c; -} - -// 32 >> 64 -inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) -{ - int64 ptr[4] = {0}; - vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); - v_int64x2 t1 = v_load(ptr); - v_int64x2 t2 = v_load(ptr+2); - return t1 + t2; -} -inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ - int64 ptr[4] = {0}; - vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b, 4), 4); - v_int64x2 t1 = v_load(ptr); - v_int64x2 t2 = v_load(ptr+2); - return t1 + t2 + c; -} - - -// 8 >> 32 -inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) -{ - unsigned ptr[16] = {0}; - vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); - v_uint32x4 t1 = v_load(ptr); - v_uint32x4 t2 = v_load(ptr+4); - v_uint32x4 t3 = v_load(ptr+8); - v_uint32x4 t4 = v_load(ptr+12); - return t1 + t2 + t3 + t4; -} -inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const 
v_uint32x4& c) -{ - unsigned ptr[16] = {0}; - vse32_v_u32m4(ptr, vwcvtu_x_x_v_u32m4(vwmulu_vv_u16m2(a, b, 16), 16), 16); - v_uint32x4 t1 = v_load(ptr); - v_uint32x4 t2 = v_load(ptr+4); - v_uint32x4 t3 = v_load(ptr+8); - v_uint32x4 t4 = v_load(ptr+12); - return t1 + t2 + t3 + t4 + c; -} -inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) -{ - int ptr[16] = {0}; - vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); - v_int32x4 t1 = v_load(ptr); - v_int32x4 t2 = v_load(ptr+4); - v_int32x4 t3 = v_load(ptr+8); - v_int32x4 t4 = v_load(ptr+12); - return t1 + t2 + t3 + t4; -} -inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ - int ptr[16] = {0}; - vse32_v_i32m4(ptr, vwcvt_x_x_v_i32m4(vwmul_vv_i16m2(a, b, 16), 16), 16); - v_int32x4 t1 = v_load(ptr); - v_int32x4 t2 = v_load(ptr+4); - v_int32x4 t3 = v_load(ptr+8); - v_int32x4 t4 = v_load(ptr+12); - return t1 + t2 + t3 + t4 + c; -} - -// 16 >> 64 -inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) -{ - uint64 ptr[8] = {0}; - vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); - v_uint64x2 t1 = v_load(ptr); - v_uint64x2 t2 = v_load(ptr+2); - v_uint64x2 t3 = v_load(ptr+4); - v_uint64x2 t4 = v_load(ptr+6); - return t1 + t2 + t3 + t4; -} -inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ - uint64 ptr[8] = {0}; - vse64_v_u64m4(ptr, vwcvtu_x_x_v_u64m4(vwmulu_vv_u32m2(a, b, 8), 8), 8); - v_uint64x2 t1 = v_load(ptr); - v_uint64x2 t2 = v_load(ptr+2); - v_uint64x2 t3 = v_load(ptr+4); - v_uint64x2 t4 = v_load(ptr+6); - return t1 + t2 + t3 + t4 + c; -} -inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) -{ - int64 ptr[8] = {0}; - vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); - v_int64x2 t1 = v_load(ptr); - v_int64x2 t2 = v_load(ptr+2); - v_int64x2 t3 = v_load(ptr+4); - v_int64x2 t4 = v_load(ptr+6); - return t1 + t2 + t3 + t4; -} -inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ - int64 ptr[8] = {0}; - vse64_v_i64m4(ptr, vwcvt_x_x_v_i64m4(vwmul_vv_i32m2(a, b, 8), 8), 8); - v_int64x2 t1 = v_load(ptr); - v_int64x2 t2 = v_load(ptr+2); - v_int64x2 t3 = v_load(ptr+4); - v_int64x2 t4 = v_load(ptr+6); - return t1 + t2 + t3 + t4 + c; -} - -// 32 >> 64f -#if CV_SIMD128_64F -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_cvt_f64(v_dotprod_fast(a, b)); } -inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } -#endif - - -inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& m3) -{ - vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4); - res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4); - res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4); - res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3, 4); - return v_float32x4(res); -} - -inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, - const v_float32x4& m1, const v_float32x4& m2, - const v_float32x4& a) -{ - vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v), 4); - res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1, 4); - res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2, 4); - return v_float32x4(res) + a; -} - -#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, 
_Tpwvec, _Tpw, suffix, wmul, width, vl, hvl) \ -inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \ -{ \ - _Tpw ptr[_Tpwvec::nlanes*2] = {0}; \ - vse##width##_v_##suffix##m2(ptr, wmul(a, b, vl), vl); \ - c = _Tpwvec(vle##width##_v_##suffix##m1(ptr, hvl)); \ - d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes, hvl)); \ -} - -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16, 16, 8) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16, 16, 8) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32, 8, 4) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32, 8, 4) -OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64, 4, 2) - - -inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) -{ - return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b, 8), 16, 8)); -} -inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) -{ - return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b, 8), 16, 8)); -} - - -//////// Saturating Multiply //////// - -#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ -{ \ - _wTpvec c, d; \ - v_mul_expand(a, b, c, d); \ - return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a = a * b; \ - return a; \ -} - -OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8) -OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8) -OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4) -OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4) - - -inline void v_cleanup() {} - -CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END - - -} - -#endif diff --git a/3rdParty/opencv2/core/hal/intrin_rvv071.hpp b/3rdParty/opencv2/core/hal/intrin_rvv071.hpp index d4f3751dd7..bcd2662082 100644 --- a/3rdParty/opencv2/core/hal/intrin_rvv071.hpp +++ b/3rdParty/opencv2/core/hal/intrin_rvv071.hpp @@ -32,11 +32,11 @@ struct v_uint8x16 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) { uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16); + val = (vuint8m1_t)vle8_v_u8m1((unsigned char*)v, 16); } uchar get0() const { - return vmv_x_s_u8m1_u8(val, 16); + return vmv_x_s_u8m1_u8(val); } vuint8m1_t val; @@ -53,11 +53,11 @@ struct v_int8x16 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) { schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; - val = (vint8m1_t)vle_v_i8m1((schar*)v, 16); + val = (vint8m1_t)vle8_v_i8m1((schar*)v, 16); } schar get0() const { - return vmv_x_s_i8m1_i8(val, 16); + return vmv_x_s_i8m1_i8(val); } vint8m1_t val; @@ -73,11 +73,11 @@ struct v_uint16x8 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) { ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8); + val = (vuint16m1_t)vle16_v_u16m1((unsigned short*)v, 8); } ushort get0() const { - return vmv_x_s_u16m1_u16(val, 8); + return vmv_x_s_u16m1_u16(val); } vuint16m1_t val; @@ -93,11 +93,11 @@ struct v_int16x8 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) { short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; - val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8); + val = (vint16m1_t)vle16_v_i16m1((signed short*)v, 8); } 
short get0() const { - return vmv_x_s_i16m1_i16(val, 8); + return vmv_x_s_i16m1_i16(val); } vint16m1_t val; @@ -113,11 +113,11 @@ struct v_uint32x4 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) { unsigned v[] = {v0, v1, v2, v3}; - val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4); + val = (vuint32m1_t)vle32_v_u32m1((unsigned int*)v, 4); } unsigned get0() const { - return vmv_x_s_u32m1_u32(val, 4); + return vmv_x_s_u32m1_u32(val); } vuint32m1_t val; @@ -133,11 +133,11 @@ struct v_int32x4 v_int32x4(int v0, int v1, int v2, int v3) { int v[] = {v0, v1, v2, v3}; - val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4); + val = (vint32m1_t)vle32_v_i32m1((signed int*)v, 4); } int get0() const { - return vmv_x_s_i32m1_i32(val, 4); + return vmv_x_s_i32m1_i32(val); } vint32m1_t val; }; @@ -152,11 +152,11 @@ struct v_float32x4 v_float32x4(float v0, float v1, float v2, float v3) { float v[] = {v0, v1, v2, v3}; - val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4); + val = (vfloat32m1_t)vle32_v_f32m1((float*)v, 4); } float get0() const { - return vfmv_f_s_f32m1_f32(val, 4); + return vfmv_f_s_f32m1_f32(val); } vfloat32m1_t val; }; @@ -171,11 +171,11 @@ struct v_uint64x2 v_uint64x2(uint64 v0, uint64 v1) { uint64 v[] = {v0, v1}; - val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2); + val = (vuint64m1_t)vle64_v_u64m1((unsigned long*)v, 2); } uint64 get0() const { - return vmv_x_s_u64m1_u64(val, 2); + return vmv_x_s_u64m1_u64(val); } vuint64m1_t val; }; @@ -190,11 +190,11 @@ struct v_int64x2 v_int64x2(int64 v0, int64 v1) { int64 v[] = {v0, v1}; - val = (vint64m1_t)vle_v_i64m1((long*)v, 2); + val = (vint64m1_t)vle64_v_i64m1((long*)v, 2); } int64 get0() const { - return vmv_x_s_i64m1_i64(val, 2); + return vmv_x_s_i64m1_i64(val); } vint64m1_t val; }; @@ -209,21 +209,21 @@ struct v_float64x2 v_float64x2(double v0, double v1) { double v[] = {v0, v1}; - val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2); + val = (vfloat64m1_t)vle64_v_f64m1((double*)v, 2); } double get0() const { - return vfmv_f_s_f64m1_f64(val, 2); + return vfmv_f_s_f64m1_f64(val); } vfloat64m1_t val; }; - +/* #define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \ -inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \ +inline _Tp##m1_t vreinterpret_v_##suffix##m1_##suffix##m1(_Tp##m1_t v) { return v; } \ inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \ inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \ inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \ -inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } \ inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \ inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \ inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \ @@ -233,99 +233,197 @@ inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2( OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8) -OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8) +OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, i8) OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16) -OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16) +OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, 
vint16, i16) OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32) -OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32) +OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, i32) OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64) -OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64) +OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, i64) OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64) OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32) +*/ +inline v_uint8x16 v_reinterpret_as_u8(const v_uint8x16& v) { return v_uint8x16(v.val); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint8x16& v) { return v_int8x16(vreinterpret_v_u8m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint8x16& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(vreinterpret_v_u8m1_u16m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint8x16& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint8x16& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u8m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint8x16& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u8m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint8x16& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u8m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int8x16& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_int8x16& v) { return v_int8x16(v.val); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int8x16& v) { return v_uint16x8(vreinterpret_v_u8m1_u16m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int8x16& v) { return v_int16x8(vreinterpret_v_i8m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int8x16& v) { return v_uint32x4(vreinterpret_v_u8m1_u32m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int8x16& v) { return v_int32x4(vreinterpret_v_i8m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int8x16& v) { return v_uint64x2(vreinterpret_v_u8m1_u64m1(vreinterpret_v_i8m1_u8m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int8x16& v) { return v_int64x2(vreinterpret_v_i8m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int8x16& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i8m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int8x16& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i8m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint16x8& v) { return v_uint8x16(vreinterpret_v_u16m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(vreinterpret_v_u16m1_i16m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint16x8& v) { return v_uint16x8(v.val); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint16x8& v) { return v_int16x8(vreinterpret_v_u16m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint16x8& v) { return 
v_uint32x4(vreinterpret_v_u16m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint16x8& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint16x8& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u16m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint16x8& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u16m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint16x8& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u16m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int16x8& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int16x8& v) { return v_int8x16(vreinterpret_v_i16m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int16x8& v) { return v_uint16x8(vreinterpret_v_i16m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_int16x8& v) { return v_int16x8(v.val); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int16x8& v) { return v_uint32x4(vreinterpret_v_u16m1_u32m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int16x8& v) { return v_int32x4(vreinterpret_v_i16m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int16x8& v) { return v_uint64x2(vreinterpret_v_u16m1_u64m1(vreinterpret_v_i16m1_u16m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int16x8& v) { return v_int64x2(vreinterpret_v_i16m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int16x8& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i16m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int16x8& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i16m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_u32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint32x4& v) { return v_uint32x4(v.val); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint32x4& v) { return v_int32x4(vreinterpret_v_u32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint32x4& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(vreinterpret_v_u32m1_u64m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint32x4& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint32x4& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(vreinterpret_v_u32m1_u64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int32x4& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int32x4& v) { return 
v_int8x16(vreinterpret_v_i32m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int32x4& v) { return v_uint32x4(vreinterpret_v_i32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_int32x4& v) { return v_int32x4(v.val); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_i32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_int32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_int32x4& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(v.val)); } +inline v_float64x2 v_reinterpret_as_f64(const v_int32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(v.val))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_uint64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(v.val)); } +inline v_int8x16 v_reinterpret_as_s8(const v_uint64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_uint64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(v.val)); } +inline v_int16x8 v_reinterpret_as_s16(const v_uint64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_uint64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_uint64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_u64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_uint64x2& v) { return v_uint64x2(v.val); } +inline v_int64x2 v_reinterpret_as_s64(const v_uint64x2& v) { return v_int64x2(vreinterpret_v_u64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& v) { return v_float32x4(vreinterpret_v_u32m1_f32m1(vreinterpret_v_u64m1_u32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& v) { return v_float64x2(vreinterpret_v_u64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_int64x2& v) { return v_uint8x16(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_int64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(v.val)); } +inline v_uint16x8 v_reinterpret_as_u16(const v_int64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_int64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(v.val)); } +inline v_uint32x4 v_reinterpret_as_u32(const v_int64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_i64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_int64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_int64x2& v) { return v_uint64x2(vreinterpret_v_i64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_int64x2& v) { return v_int64x2(v.val); } +inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(v.val))); } +inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& v) { return 
v_float64x2(vreinterpret_v_i64m1_f64m1(v.val)); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float32x4& v) { return v_uint8x16(vreinterpret_v_u32m1_u8m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float32x4& v) { return v_int8x16(vreinterpret_v_i32m1_i8m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float32x4& v) { return v_uint16x8(vreinterpret_v_u32m1_u16m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float32x4& v) { return v_int16x8(vreinterpret_v_i32m1_i16m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float32x4& v) { return v_uint32x4(vreinterpret_v_f32m1_u32m1(v.val)); } +inline v_int32x4 v_reinterpret_as_s32(const v_float32x4& v) { return v_int32x4(vreinterpret_v_f32m1_i32m1(v.val)); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float32x4& v) { return v_uint64x2(vreinterpret_v_u32m1_u64m1(vreinterpret_v_f32m1_u32m1(v.val))); } +inline v_int64x2 v_reinterpret_as_s64(const v_float32x4& v) { return v_int64x2(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val))); } +inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& v) { return v_float32x4(v.val); } +inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& v) { return v_float64x2(vreinterpret_v_i64m1_f64m1(vreinterpret_v_i32m1_i64m1(vreinterpret_v_f32m1_i32m1(v.val)))); } + +inline v_uint8x16 v_reinterpret_as_u8(const v_float64x2& v) { return v_uint8x16(vreinterpret_v_u64m1_u8m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int8x16 v_reinterpret_as_s8(const v_float64x2& v) { return v_int8x16(vreinterpret_v_i64m1_i8m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint16x8 v_reinterpret_as_u16(const v_float64x2& v) { return v_uint16x8(vreinterpret_v_u64m1_u16m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int16x8 v_reinterpret_as_s16(const v_float64x2& v) { return v_int16x8(vreinterpret_v_i64m1_i16m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint32x4 v_reinterpret_as_u32(const v_float64x2& v) { return v_uint32x4(vreinterpret_v_u64m1_u32m1(vreinterpret_v_f64m1_u64m1(v.val))); } +inline v_int32x4 v_reinterpret_as_s32(const v_float64x2& v) { return v_int32x4(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val))); } +inline v_uint64x2 v_reinterpret_as_u64(const v_float64x2& v) { return v_uint64x2(vreinterpret_v_f64m1_u64m1(v.val)); } +inline v_int64x2 v_reinterpret_as_s64(const v_float64x2& v) { return v_int64x2(vreinterpret_v_f64m1_i64m1(v.val)); } +inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& v) { return v_float32x4(vreinterpret_v_i32m1_f32m1(vreinterpret_v_i64m1_i32m1(vreinterpret_v_f64m1_i64m1(v.val)))); } +inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& v) { return v_float64x2(v.val); } + #define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \ -inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \ -inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } +inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num(vmv_v_x_##len##m1(0, num)); } \ +inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); } \ +template <> inline v_##_Tp##x##num v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline v_##_Tp##x##num v_setall_(__Tp v) { return v_setall_##suffix(v); } OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16) 
-OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16) +OPENCV_HAL_IMPL_RISCVV_INIT_SET(schar, int8, s8, i8, 16) OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8) OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8) OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4) OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2) OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2) -inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); } +inline v_float32x4 v_setzero_f32() { return v_float32x4(vfmv_v_f_f32m1(0, 4)); } inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); } inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); } inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); } +template <> inline v_float32x4 v_setzero_() { return v_setzero_f32(); } +template <> inline v_float32x4 v_setall_(float v) { return v_setall_f32(v); } + +template <> inline v_float64x2 v_setzero_() { return v_setzero_f64(); } +template <> inline v_float64x2 v_setall_(double v) { return v_setall_f64(v); } #define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ } #define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val, num)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val, num); \ - return a; \ -} - -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4) -inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b) -{ - return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); } -inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b) + 
+OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint8x16, vsaddu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint8x16, vssubu_vv_u8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int8x16, vsadd_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int8x16, vssub_vv_i8m1, 16) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint16x8, vsaddu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint16x8, vssubu_vv_u16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int16x8, vsadd_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int16x8, vssub_vv_i16m1, 8) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int32x4, vadd_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int32x4, vsub_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_int32x4, vmul_vv_i32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint32x4, vadd_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint32x4, vsub_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_uint32x4, vmul_vv_u32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_int64x2, vadd_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_int64x2, vsub_vv_i64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_uint64x2, vadd_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_uint64x2, vsub_vv_u64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float32x4, vfadd_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float32x4, vfsub_vv_f32m1, 4) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float32x4, vfmul_vv_f32m1, 4) +inline v_float32x4 v_div(const v_float32x4& a, const v_float32x4& b) { - a.val = vfdiv_vv_f32m1(a.val, b.val, 4); - return a; + return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4)); } -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2) -OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2) -inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_add, v_float64x2, vfadd_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_sub, v_float64x2, vfsub_vv_f64m1, 2) +OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_mul, v_float64x2, vfmul_vv_f64m1, 2) +inline v_float64x2 v_div(const v_float64x2& a, const v_float64x2& b) { return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2)); } -inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b) -{ - a.val = vfdiv_vv_f64m1(a.val, b.val, 2); - return a; -} // TODO: exp, log, sin, cos #define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \ @@ -379,12 +477,12 @@ inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { - return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4)); + return v_float32x4(vfmadd_vv_f32m1(a.val, b.val, c.val, 4)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4)); + return v_int32x4(vmadd_vv_i32m1(a.val, b.val, c.val, 4)); } inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) @@ -401,10 +499,10 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = 
vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 3, 4), m3.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -412,9 +510,9 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& a) { - vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);//vmuli_f32(m0.val, v.val, 0); - res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); - res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + vfloat32m1_t res = vfmul_vv_f32m1(m0.val, vrgather_vx_f32m1(v.val, 0, 4), 4);//vmuli_f32(m0.val, v.val, 0); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 1, 4), m1.val, 4);//vmulai_f32(res, m1.val, v.val, 1); + res = vfmacc_vv_f32m1(res, vrgather_vx_f32m1(v.val, 2, 4), m2.val, 4);//vmulai_f32(res, m1.val, v.val, 1); res = vfadd_vv_f32m1(res, a.val, 4);//vmulai_f32(res, m1.val, v.val, 1); return v_float32x4(res); } @@ -442,7 +540,7 @@ inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2)); + return v_float64x2(vfmadd_vv_f64m1(a.val, b.val, c.val, 2)); } inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) @@ -451,10 +549,10 @@ inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_ } #define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \ - OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \ - inline _Tpvec operator ~ (const _Tpvec & a) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_and, _Tpvec, vand_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_or, _Tpvec, vor_vv_##suffix, num) \ + OPENCV_HAL_IMPL_RISCVV_BIN_OPN(v_xor, _Tpvec, vxor_vv_##suffix, num) \ + inline _Tpvec v_not(const _Tpvec & a) \ { \ return _Tpvec(vnot_v_##suffix(a.val, num)); \ } @@ -469,43 +567,33 @@ OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4) OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2) #define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \ -inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +inline v_float32x4 bin_op(const v_float32x4& a, const v_float32x4& b) \ { \ - return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \ -} \ -inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ -{ \ - a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \ - return a; \ + return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a.val), vreinterpret_v_f32m1_i32m1(b.val), 4))); \ } -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1) -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1) -OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_and, vand_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_or, 
vor_vv_i32m1) +OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(v_xor, vxor_vv_i32m1) -inline v_float32x4 operator ~ (const v_float32x4& a) +inline v_float32x4 v_not(const v_float32x4& a) { - return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4))); + return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 4))); } #define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \ -inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ -{ \ - return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \ -} \ -inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +inline v_float64x2 bin_op(const v_float64x2& a, const v_float64x2& b) \ { \ - a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \ - return a; \ + return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a.val), vreinterpret_v_f64m1_i64m1(b.val), 2))); \ } -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1) -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1) -OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_and, vand_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_or, vor_vv_i64m1) +OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(v_xor, vxor_vv_i64m1) -inline v_float64x2 operator ~ (const v_float64x2& a) +inline v_float64x2 v_not(const v_float64x2& a) { - return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2))); + return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a.val), 2))); } inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { @@ -527,19 +615,19 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) inline v_uint32x4 v_abs(v_int32x4 x) { vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4); - return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4)); + return v_uint32x4(vreinterpret_v_i32m1_u32m1(vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4))); } inline v_uint16x8 v_abs(v_int16x8 x) { vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8); - return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8)); + return v_uint16x8(vreinterpret_v_i16m1_u16m1(vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8))); } inline v_uint8x16 v_abs(v_int8x16 x) { vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16); - return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16)); + return v_uint8x16(vreinterpret_v_i8m1_u8m1(vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16))); } inline v_float32x4 v_abs(v_float32x4 x) @@ -591,7 +679,7 @@ inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){ inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \ vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\ vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\ - return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \ + return v_uint##_Tpvec(vreinterpret_v_i##_Tpv##_u##_Tpv(vsub_vv_i##_Tpv(max, min, num))); \ } OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16) @@ -604,8 +692,8 @@ inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, { vint16m2_t res = vundefined_i16m2(); res = vwmul_vv_i16m2(a.val, b.val, 16); - c.val = vget_i16m2_i16m1(res, 0); - d.val = vget_i16m2_i16m1(res, 1); + c.val = vget_v_i16m2_i16m1(res, 0); + d.val = vget_v_i16m2_i16m1(res, 1); } inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, @@ -613,8 +701,8 @@ inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, { vuint16m2_t res = 
vundefined_u16m2(); res = vwmulu_vv_u16m2(a.val, b.val, 16); - c.val = vget_u16m2_u16m1(res, 0); - d.val = vget_u16m2_u16m1(res, 1); + c.val = vget_v_u16m2_u16m1(res, 0); + d.val = vget_v_u16m2_u16m1(res, 1); } inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, @@ -622,8 +710,8 @@ inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, { vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - c.val = vget_i32m2_i32m1(res, 0); - d.val = vget_i32m2_i32m1(res, 1); + c.val = vget_v_i32m2_i32m1(res, 0); + d.val = vget_v_i32m2_i32m1(res, 1); } inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, @@ -631,8 +719,8 @@ inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, { vuint32m2_t res = vundefined_u32m2(); res = vwmulu_vv_u32m2(a.val, b.val, 8); - c.val = vget_u32m2_u32m1(res, 0); - d.val = vget_u32m2_u32m1(res, 1); + c.val = vget_v_u32m2_u32m1(res, 0); + d.val = vget_v_u32m2_u32m1(res, 1); } inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, @@ -640,8 +728,8 @@ inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b, { vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - c.val = vget_i64m2_i64m1(res, 0); - d.val = vget_i64m2_i64m1(res, 1); + c.val = vget_v_i64m2_i64m1(res, 0); + d.val = vget_v_i64m2_i64m1(res, 1); } inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, @@ -649,8 +737,8 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, { vuint64m2_t res = vundefined_u64m2(); res = vwmulu_vv_u64m2(a.val, b.val, 4); - c.val = vget_u64m2_u64m1(res, 0); - d.val = vget_u64m2_u64m1(res, 1); + c.val = vget_v_u64m2_u64m1(res, 0); + d.val = vget_v_u64m2_u64m1(res, 1); } OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16) @@ -669,118 +757,202 @@ OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8) // 16 >> 32 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0), vget_v_i32m2_i32m1(res, 1), 4)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { + vuint32m2_t vindex = vundefined_u32m2(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 1, 4); + vindex = vset_v_u32m1_u32m2(vindex, 0, vindex0); + vindex = vset_v_u32m1_u32m2(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); vint32m2_t res = vundefined_i32m2(); res = vwmul_vv_i32m2(a.val, b.val, 8); - res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0),vget_i32m2_i32m1(res, 1), 4), c.val, 4)); + res = vrgather_vv_i32m2(res, vindex, 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(res, 0),vget_v_i32m2_i32m1(res, 1), 4), c.val, 4)); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 
= vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { + vuint64m2_t vindex = vundefined_u64m2(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 1, 2); + vindex = vset_v_u64m1_u64m2(vindex, 0, vindex0); + vindex = vset_v_u64m1_u64m2(vindex, 1, vadd_vx_u64m1(vindex0, 1, 2)); vint64m2_t res = vundefined_i64m2(); res = vwmul_vv_i64m2(a.val, b.val, 4); - res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2)); + res = vrgather_vv_i64m2(res, vindex, 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(res, 0), vget_v_i64m2_i64m1(res, 1), 2), c.val, 2)); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_u16m2(v1, vindex, 16); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return 
v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { + vuint32m4_t vindex32 = vundefined_u32m4(); + vuint32m1_t vindex0 = vid_v_u32m1(4); + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); + vindex32 = vset_v_u32m1_u32m4(vindex32, 0, vindex0); + vindex32 = vset_v_u32m1_u32m4(vindex32, 1, vadd_vx_u32m1(vindex0, 1, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 2, vadd_vx_u32m1(vindex0, 2, 4)); + vindex32 = vset_v_u32m1_u32m4(vindex32, 3, vadd_vx_u32m1(vindex0, 3, 4)); + vuint16m2_t vindex = vnsrl_wx_u16m2(vindex32, 0, 16); vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v1 = vrgather_vv_i16m2(v1, vindex, 16); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 
1), 2)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v1 = vrgather_vv_u32m2(v1, vindex, 8); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v1 = vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { + vuint64m4_t vindex64 = vundefined_u64m4(); + vuint64m1_t vindex0 = vid_v_u64m1(2); + vindex0 = vsll_vx_u64m1(vindex0, 2, 2); + vindex64 = vset_v_u64m1_u64m4(vindex64, 0, vindex0); + vindex64 = vset_v_u64m1_u64m4(vindex64, 1, vadd_vx_u64m1(vindex0, 1, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 2, vadd_vx_u64m1(vindex0, 2, 2)); + vindex64 = vset_v_u64m1_u64m4(vindex64, 3, vadd_vx_u64m1(vindex0, 3, 2)); + vuint32m2_t vindex = vnsrl_wx_u32m2(vindex64, 0, 8); vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v1 = vrgather_vv_i32m2(v1, vindex, 8); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } //////// Fast Dot Product //////// @@ -789,14 +961,14 @@ inline v_int32x4 
v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4)); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4)); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) { vint32m2_t v1 = vundefined_i32m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4)); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4), c.val, 4)); } // 32 >> 64 @@ -804,13 +976,13 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2)); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2)); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { vint64m2_t v1 = vundefined_i64m2(); v1 = vwmul_vv_i64m2(a.val, b.val, 8); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 4), c.val, 4)); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 4), c.val, 4)); } // 8 >> 32 @@ -819,8 +991,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -828,8 +1000,8 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b vuint16m2_t v1 = vundefined_u16m2(); vuint32m2_t v2 = vundefined_u32m2(); v1 = vwmulu_vv_u16m2(a.val, b.val, 16); - v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8); - return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4)); + v2 = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(v1, 0), vget_v_u16m2_u16m1(v1, 1), 8); + return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_v_u32m2_u32m1(v2, 0), vget_v_u32m2_u32m1(v2, 1), 4), c.val, 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) @@ -837,16 +1009,16 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4)); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) { vint16m2_t v1 = vundefined_i16m2(); vint32m2_t v2 = 
vundefined_i32m2(); v1 = vwmul_vv_i16m2(a.val, b.val, 16); - v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8); - return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4)); + v2 = vwadd_vv_i32m2(vget_v_i16m2_i16m1(v1, 0), vget_v_i16m2_i16m1(v1, 1), 8); + return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_v_i32m2_i32m1(v2, 0), vget_v_i32m2_i32m1(v2, 1), 4), c.val, 4)); } // 16 >> 64 @@ -855,16 +1027,16 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2)); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) { vuint32m2_t v1 = vundefined_u32m2(); vuint64m2_t v2 = vundefined_u64m2(); v1 = vwmulu_vv_u32m2(a.val, b.val, 8); - v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4); - return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2)); + v2 = vwaddu_vv_u64m2(vget_v_u32m2_u32m1(v1, 0), vget_v_u32m2_u32m1(v1, 1), 4); + return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_v_u64m2_u64m1(v2, 0), vget_v_u64m2_u64m1(v2, 1), 2), c.val, 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) @@ -872,16 +1044,16 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2)); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2)); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) { vint32m2_t v1 = vundefined_i32m2(); vint64m2_t v2 = vundefined_i64m2(); v1 = vwmul_vv_i32m2(a.val, b.val, 8); - v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4); - return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2)); + v2 = vwadd_vv_i64m2(vget_v_i32m2_i32m1(v1, 0), vget_v_i32m2_i32m1(v1, 1), 4); + return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v2, 0), vget_v_i64m2_i64m1(v2, 1), 2), c.val, 2)); } @@ -890,16 +1062,16 @@ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \ val = intrin(val, a.val, val, num); \ - return vmv_x_s_##len##m1_##len(val, num); \ + return vmv_x_s_##len##m1_##len(val); \ } -#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \ +#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num, scalerfunc) \ inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \ {\ - v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \ + v##_Tpvec##m1_t val = vundefined_##_Tpvec2##m1(); \ val = 
v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \ - return val[0]; \ + return scalerfunc(val); \ } OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8) @@ -910,30 +1082,30 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu inline float v_reduce_sum(const v_float32x4& a) \ {\ vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4); \ - val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4); \ - return vfmv_f_s_f32m1_f32(val, 4); \ + val = vfredosum_vs_f32m1_f32m1(val, a.val, val, 4); \ + return vfmv_f_s_f32m1_f32(val); \ } inline double v_reduce_sum(const v_float64x2& a) \ {\ vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2); \ - val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2); \ - return vfmv_f_s_f64m1_f64(val, 2); \ + val = vfredosum_vs_f64m1_f64m1(val, a.val, val, 2); \ + return vfmv_f_s_f64m1_f64(val); \ } inline uint64 v_reduce_sum(const v_uint64x2& a) -{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); } +{ vuint64m1_t res = vundefined_u64m1(); return vmv_x_s_u64m1_u64(vredsum_vs_u64m1_u64m1(res, a.val, vmv_v_x_u64m1(0, 2), 2)); } inline int64 v_reduce_sum(const v_int64x2& a) -{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); } +{ vint64m1_t res = vundefined_i64m1(); return vmv_x_s_i64m1_i64(vredsum_vs_i64m1_i64m1(res, a.val, vmv_v_x_i64m1(0, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \ -OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4) +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16, vmv_x_s_i8m1_i8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8, vmv_x_s_i16m1_i16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4, vmv_x_s_i32m1_i32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2, vmv_x_s_i64m1_i64) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16, vmv_x_s_u8m1_u8) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8, vmv_x_s_u16m1_u16) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4, vmv_x_s_u32m1_u32) \ +OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4, vfmv_f_s_f32m1_f32) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max) OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min) @@ -944,11 +1116,15 @@ inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4); vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4); - b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4); - c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4); - d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4); - return v_float32x4(a0[0], b0[0], c0[0], d0[0]); + a0 = 
vfredosum_vs_f32m1_f32m1(a0, a.val, a0, 4); + b0 = vfredosum_vs_f32m1_f32m1(b0, b.val, b0, 4); + c0 = vfredosum_vs_f32m1_f32m1(c0, c.val, c0, 4); + d0 = vfredosum_vs_f32m1_f32m1(d0, d.val, d0, 4); + vfloat32m1_t res; + res = vslideup_vx_f32m1(a0, b0, 1, 4); + res = vslideup_vx_f32m1(res, c0, 2, 4); + res = vslideup_vx_f32m1(res, d0, 3, 4); + return v_float32x4(res); } inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) @@ -957,8 +1133,8 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4); vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4); vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4); - a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4); - return a0[0]; + a0 = vfredosum_vs_f32m1_f32m1(a0, val, a0, 4); + return vfmv_f_s_f32m1_f32(a0); } #define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \ @@ -975,32 +1151,32 @@ OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4) OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4) #define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { \ vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \ return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \ @@ -1016,91 +1192,91 @@ OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_) OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_) //TODO: == -inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_eq(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ne(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + 
return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_lt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_le(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_gt(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } -inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b) +inline v_float32x4 v_ge(const v_float32x4& a, const v_float32x4& b) { vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); -} + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); +}/**/ inline v_float32x4 v_not_nan(const v_float32x4& a) { - vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4); + vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, a.val, 4); vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4); - return v_float32x4((vfloat32m1_t)res); + return v_float32x4(vreinterpret_v_i32m1_f32m1(res)); } //TODO: == -inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_eq(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ne(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_lt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_le(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_gt(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = 
vmfgt_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } -inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b) +inline v_float64x2 v_ge(const v_float64x2& a, const v_float64x2& b) { vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); -} + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); +}/**/ inline v_float64x2 v_not_nan(const v_float64x2& a) { - vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2); + vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, a.val, 2); vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2); - return v_float64x2((vfloat64m1_t)res); + return v_float64x2(vreinterpret_v_i64m1_f64m1(res)); } #define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ @@ -1108,16 +1284,23 @@ inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \ v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \ v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \ { \ + vuint32m4_t vindex = vundefined_u32m4(); \ + vuint32m1_t vindex0 = vid_v_u32m1(4); \ + vindex0 = vsll_vx_u32m1(vindex0, 2, 4); \ + vindex = vset_v_u32m1_u32m4(vindex, 0, vindex0); \ + vindex = vset_v_u32m1_u32m4(vindex, 1, vadd_vx_u32m1(vindex0, 1, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 2, vadd_vx_u32m1(vindex0, 2, 4)); \ + vindex = vset_v_u32m1_u32m4(vindex, 3, vadd_vx_u32m1(vindex0, 3, 4)); \ v##_Tp##32m4_t val = vundefined_##_T##m4(); \ - val = vset_##_T##m4(val, 0, a0.val); \ - val = vset_##_T##m4(val, 1, a1.val); \ - val = vset_##_T##m4(val, 2, a2.val); \ - val = vset_##_T##m4(val, 3, a3.val); \ - val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \ - b0.val = vget_##_T##m4_##_T##m1(val, 0); \ - b1.val = vget_##_T##m4_##_T##m1(val, 1); \ - b2.val = vget_##_T##m4_##_T##m1(val, 2); \ - b3.val = vget_##_T##m4_##_T##m1(val, 3); \ + val = vset_v_##_T##m1_##_T##m4(val, 0, a0.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 1, a1.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 2, a2.val); \ + val = vset_v_##_T##m1_##_T##m4(val, 3, a3.val); \ + val = vrgather_vv_##_T##m4(val, vindex, 16); \ + b0.val = vget_v_##_T##m4_##_T##m1(val, 0); \ + b1.val = vget_v_##_T##m4_##_T##m1(val, 1); \ + b2.val = vget_v_##_T##m4_##_T##m1(val, 2); \ + b3.val = vget_v_##_T##m4_##_T##m1(val, 3); \ } OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32) OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32) @@ -1125,13 +1308,13 @@ OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32) #define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \ -inline _Tpvec operator << (const _Tpvec& a, int n) \ +inline _Tpvec v_shl(const _Tpvec& a, int n) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } #define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \ -inline _Tpvec operator >> (const _Tpvec& a, int n) \ +inline _Tpvec v_shr(const _Tpvec& a, int n) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \ template inline _Tpvec v_shr(const _Tpvec& a) \ { return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\ @@ -1167,25 +1350,28 @@ template inline _Tpvec v_rotate_left(const _Tpvec& a) \ } \ template inline _Tpvec 
v_rotate_right(const _Tpvec& a) \ { \ - return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\ + suffix##m1_t res = vundefined_##_T##m1(); \ + return _Tpvec(vslidedown_vx_##_T##m1(res, a.val, n, num));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ { return a; } \ template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, a.val); \ - tmp = vset_##_T##m2(tmp, 1, b.val); \ - tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, b.val); \ + res = vslidedown_vx_##_T##m2(res, tmp, n, num2);\ + return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 0));\ } \ template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ { \ suffix##m2_t tmp = vundefined_##_T##m2(); \ - tmp = vset_##_T##m2(tmp, 0, b.val); \ - tmp = vset_##_T##m2(tmp, 1, a.val); \ - tmp = vslideup_vx_##_T##m2(tmp, n, num2);\ - return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\ + suffix##m2_t res = vundefined_##_T##m2(); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, b.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a.val); \ + res = vslideup_vx_##_T##m2(res, tmp, n, num2);\ + return _Tpvec(vget_v_##_T##m2_##_T##m1(res, 1));\ } \ template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ { \ @@ -1203,50 +1389,132 @@ OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32) OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64) -#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \ +#if 1 +#define vreinterpret_v_i8m1_i8m1 +#define vreinterpret_v_u8m1_u8m1 +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize, ldst_len, ldst_type) \ inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ { \ - typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \ - vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\ - return _Tpvec(_Tp2##_t(tmp)); } \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr0, 8)); \ + res1 = vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr1, 8)); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ inline _Tpvec v_load_low(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, hnum)); }\ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 8))); }\ inline _Tpvec v_load_aligned(const _Tp* ptr) \ -{ return _Tpvec(vle_v_##len(ptr, num)); } \ +{ return _Tpvec(vreinterpret_v_##ldst_len##_##len(vle8_v_##ldst_len((ldst_type *)ptr, 16))); } \ inline _Tpvec v_load(const _Tp* ptr) \ -{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \ inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, hnum);}\ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 8);}\ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ { \ - _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \ - vse_v_##len(ptr, a0, hnum);}\ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_##ldst_len((ldst_type 
*)ptr, vreinterpret_v_##len##_##ldst_len(a0), 8);}\ inline void v_store(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ -{ vse_v_##len(ptr, a.val, num); } \ +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } \ inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ -{ vse_v_##len(ptr, a.val, num); } +{ vse8_v_##ldst_len((ldst_type *)ptr, vreinterpret_v_##len##_##ldst_len(a.val), 16); } + +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32, i8m1, schar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64, u8m1, uchar) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64, i8m1, schar) + +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res = vundefined_##len(); \ + _Tp2##_t res1 = vundefined_##len(); \ + res = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr0, 8))); \ + res1 = vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr1, 8))); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 8)))); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vreinterpret_v_u##elemsize##m1_##len(vreinterpret_v_u8m1_u##elemsize##m1(vle8_v_u8m1((uchar *)ptr, 16)))); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 8);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0 = vundefined_##len(); \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a0)), 8);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse8_v_u8m1((uchar *)ptr, vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse8_v_u8m1((uchar *)ptr, 
vreinterpret_v_u##elemsize##m1_u8m1(vreinterpret_v_##len##_u##elemsize##m1(a.val)), 16); } +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_FLOAT_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4) -OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2) +#else +#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num, elemsize) \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + _Tp2##_t res, res1; \ + res = vle##elemsize##_v_##len(ptr0, hnum); \ + res1 = vle##elemsize##_v_##len(ptr1, hnum); \ + res = vslideup_vx_##len(res, res1, hnum, num); \ + return _Tpvec(res); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, hnum)); }\ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(vle##elemsize##_v_##len(ptr, num)); } \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec((_Tp2##_t)vle##elemsize##_v_##len((const _Tp *)ptr, num)); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, hnum);}\ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + _Tp2##_t a0; \ + a0 = vslidedown_vx_##len(a0, a.val, hnum, num); \ + vse##elemsize##_v_##len(ptr, a0, hnum);}\ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ vse##elemsize##_v_##len(ptr, a.val, num); } + +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16, 8) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8, 16) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2, 64) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4, 32) +OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2, 64) + +#endif ////////////// Lookup table access //////////////////// inline v_int8x16 v_lut(const schar* tab, const int* idx) { -#if 1 +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[ 0]], @@ -1266,16 +1534,18 @@ 
inline v_int8x16 v_lut(const schar* tab, const int* idx) tab[idx[14]], tab[idx[15]] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); #else - int32xm4_t index32 = vlev_int32xm4(idx, 16); - vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16); - vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16); - return v_int8x16(vlxbv_i8m1(tab, index, 16)); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, vle32_v_u32m4((unsigned int *)idx, 16), 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, vle32_v_u32m4((unsigned int *)idx, 16), 16)); +#endif #endif } inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1295,10 +1565,24 @@ inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){ tab[idx[7]], tab[idx[7] + 1] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 8); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 1, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + index = vadd_vv_u32m4(vand_vx_u32m4(seq, 1, 16), vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) { +#if 0 schar CV_DECL_ALIGNED(32) elems[16] = { tab[idx[0]], @@ -1318,7 +1602,23 @@ inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) tab[idx[3] + 2], tab[idx[3] + 3] }; - return v_int8x16(vle_v_i8m1(elems, 16)); + return v_int8x16(vle8_v_i8m1(elems, 16)); +#else + vuint32m4_t seq, index; + vuint32m4_t vidx = vle32_v_u32m4((unsigned int *)idx, 4); + seq = vid_v_u32m4(16); + index = vsrl_vx_u32m4(seq, 2, 16); + vidx = vrgather_vv_u32m4(vidx, index, 16); + seq = vset_v_u32m1_u32m4(seq, 1, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 2, vget_v_u32m4_u32m1(seq, 0)); + seq = vset_v_u32m1_u32m4(seq, 3, vget_v_u32m4_u32m1(seq, 0)); + index = vadd_vv_u32m4(seq, vidx, 16); +#if __riscv_v == 7000 + return v_int8x16(vnclip_wx_i8m1(vnclip_wx_i16m2(vlxb_v_i32m4((const int *)tab, index, 16), 0, 16), 0, 16)); +#else + return v_int8x16(vloxei32_v_i8m1(tab, index, 16)); +#endif +#endif } inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } @@ -1327,6 +1627,7 @@ inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reint inline v_int16x8 v_lut(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1338,10 +1639,18 @@ inline v_int16x8 v_lut(const short* tab, const int* idx) tab[idx[6]], tab[idx[7]] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, vsll_vx_u32m2(vle32_v_u32m2((unsigned int *)idx, 8), 1, 8), 8)); +#endif +#endif } inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1353,10 +1662,24 @@ inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) tab[idx[3]], tab[idx[3] + 1] }; - return 
v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 4); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 1, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + index = vsll_vx_u32m2(vadd_vv_u32m2(vand_vx_u32m2(seq, 1, 8), vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); +#endif +#endif } inline v_int16x8 v_lut_quads(const short* tab, const int* idx) { +#if 0 short CV_DECL_ALIGNED(32) elems[8] = { tab[idx[0]], @@ -1368,7 +1691,21 @@ inline v_int16x8 v_lut_quads(const short* tab, const int* idx) tab[idx[1] + 2], tab[idx[1] + 3] }; - return v_int16x8(vle_v_i16m1(elems, 8)); + return v_int16x8(vle16_v_i16m1(elems, 8)); +#else + vuint32m2_t seq, index; + vuint32m2_t vidx = vle32_v_u32m2((unsigned int *)idx, 2); + seq = vid_v_u32m2(8); + index = vsrl_vx_u32m2(seq, 2, 8); + vidx = vrgather_vv_u32m2(vidx, index, 8); + seq = vset_v_u32m1_u32m2(seq, 1, vget_v_u32m2_u32m1(seq, 0)); + index = vsll_vx_u32m2(vadd_vv_u32m2(seq, vidx, 8), 1, 8); +#if __riscv_v == 7000 + return v_int16x8(vnclip_wx_i16m1(vlxh_v_i32m2((const int *)tab, index, 8), 0, 8)); +#else + return v_int16x8(vloxei32_v_i16m1(tab, index, 8)); +#endif +#endif } inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } @@ -1376,6 +1713,7 @@ inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_rein inline v_int32x4 v_lut(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1383,10 +1721,14 @@ inline v_int32x4 v_lut(const int* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) { +#if 0 int CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1394,11 +1736,20 @@ inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) tab[idx[1]], tab[idx[1] + 1] }; - return v_int32x4(vle_v_i32m1(elems, 4)); + return v_int32x4(vle32_v_i32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_int32x4(vloxei32_v_i32m1(tab, index, 4)); +#endif } inline v_int32x4 v_lut_quads(const int* tab, const int* idx) { - return v_int32x4(vle_v_i32m1(tab+idx[0], 4)); + return v_int32x4(vle32_v_i32m1(tab+idx[0], 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } @@ -1406,26 +1757,27 @@ inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_re inline v_int64x2 v_lut(const int64_t* tab, const int* idx) { - vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_int64x2(res); + //vint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return 
v_int64x2(vloxei64_v_i64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) { - return v_int64x2(vle_v_i64m1(tab+idx[0], 2)); + return v_int64x2(vle64_v_i64m1(tab+idx[0], 2)); } inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { - vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_uint64x2(res); + //vuint64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_uint64x2(vloxei64_v_u64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { - return v_uint64x2(vle_v_u64m1(tab+idx[0], 2)); + return v_uint64x2(vle64_v_u64m1(tab+idx[0], 2)); } inline v_float32x4 v_lut(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1433,10 +1785,14 @@ inline v_float32x4 v_lut(const float* tab, const int* idx) tab[idx[2]], tab[idx[3]] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vle32_v_u32m1((unsigned int *)idx, 4), 2, 4), 4)); +#endif } inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { +#if 0 float CV_DECL_ALIGNED(32) elems[4] = { tab[idx[0]], @@ -1444,69 +1800,79 @@ inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) tab[idx[1]], tab[idx[1]+1] }; - return v_float32x4(vle_v_f32m1(elems, 4)); + return v_float32x4(vle32_v_f32m1(elems, 4)); +#else + vuint32m1_t seq, index; + vuint32m1_t vidx = vle32_v_u32m1((unsigned int *)idx, 2); + seq = vid_v_u32m1(4); + index = vsrl_vx_u32m1(seq, 1, 4); + vidx = vrgather_vv_u32m1(vidx, index, 4); + index = vsll_vx_u32m1(vadd_vv_u32m1(vand_vx_u32m1(seq, 1, 4), vidx, 4), 2, 4); + return v_float32x4(vloxei32_v_f32m1(tab, index, 4)); +#endif } inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { - return v_float32x4(vle_v_f32m1(tab + idx[0], 4)); + return v_float32x4(vle32_v_f32m1(tab + idx[0], 4)); } inline v_float64x2 v_lut(const double* tab, const int* idx) { - vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vget_v_u64m2_u64m1(vwaddu_vx_u64m2(vle32_v_u32m1((uint32_t*)idx, 2), 0, 2), 0), 3, 2), 2)); } inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { - return v_float64x2(vle_v_f64m1(tab+idx[0], 2)); + return v_float64x2(vle64_v_f64m1(tab+idx[0], 2)); } inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) { - int CV_DECL_ALIGNED(32) elems[4] = + /*int CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_int32x4(vle_v_i32m1(elems, 4)); + };*/ + return v_int32x4(vloxei32_v_i32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) { - unsigned CV_DECL_ALIGNED(32) elems[4] = + /*unsigned CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_uint32x4(vle_v_u32m1(elems, 4)); + };*/ + return v_uint32x4(vloxei32_v_u32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) { - float CV_DECL_ALIGNED(32) elems[4] = + /*float 
CV_DECL_ALIGNED(32) elems[4] = { tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]] - }; - return v_float32x4(vle_v_f32m1(elems, 4)); + };*/ + return v_float32x4(vloxei32_v_f32m1(tab, vsll_vx_u32m1(vreinterpret_v_i32m1_u32m1(idxvec.val), 2, 4), 4)); } inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) { - vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; - return v_float64x2(res); + //vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]}; + return v_float64x2(vloxei64_v_f64m1(tab, vsll_vx_u64m1(vreinterpret_v_i64m1_u64m1(vget_v_i64m2_i64m1(vwadd_vx_i64m2(idxvec.val, 0, 2), 0)), 3, 2), 2)); } inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) { - vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4); - vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); + vint32m1_t index = vmul_vx_i32m1(idxvec.val, 4, 4); + //vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4); - x.val = vlxe_v_f32m1(tab, index_x, 4); - y.val = vlxe_v_f32m1(tab, index_y, 4); + //x.val = vlxe_v_f32m1(tab, index_x, 4); + //y.val = vlxe_v_f32m1(tab, index_y, 4); + vloxseg2ei32_v_f32m1(&x.val, &y.val, tab, vreinterpret_v_i32m1_u32m1(index), 4); } inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) @@ -1518,52 +1884,52 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]); } -#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \ +#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type, elemsize) \ inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \ }\ template inline \ v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, b.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, b.val); \ return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \ }\ inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ asm("" ::: "memory"); \ - vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ + vse##elemsize##_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \ }\ template inline \ void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \ { \ v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \ - tmp = vset_##_T2##m2(tmp, 0, a.val); \ - tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ - vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 0, a.val); \ + tmp = vset_v_##_T2##m1_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \ + vse##elemsize##_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \ } -OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, 
i8, 16, vnclip_vx, vnclip_vx, signed char) -OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short) -OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short) -OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int) +OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_wx, vnclip_wx, signed char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_wx, vnclip_wx, signed short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_wx, vnsra_wx, int, 32) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_wx, vnclipu_wx, unsigned char, 8) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_wx, vnclipu_wx, unsigned short, 16) +OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_wx, vnsrl_wx, unsigned int, 32) // pack boolean inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) { vuint16m2_t tmp = vundefined_u16m2(); \ - tmp = vset_u16m2(tmp, 0, a.val); \ - tmp = vset_u16m2(tmp, 1, b.val); \ - return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16)); + tmp = vset_v_u16m1_u16m2(tmp, 0, a.val); \ + tmp = vset_v_u16m1_u16m2(tmp, 1, b.val); \ + return v_uint8x16(vnsrl_wx_u8m1(tmp, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, @@ -1571,12 +1937,12 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, { vuint32m4_t vabcd = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - vabcd = vset_u32m4(vabcd, 0, a.val); \ - vabcd = vset_u32m4(vabcd, 1, b.val); \ - vabcd = vset_u32m4(vabcd, 2, c.val); \ - vabcd = vset_u32m4(vabcd, 3, d.val); \ - v16 = vnsrl_vx_u16m2(vabcd, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + vabcd = vset_v_u32m1_u32m4(vabcd, 0, a.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 1, b.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 2, c.val); \ + vabcd = vset_v_u32m1_u32m4(vabcd, 3, d.val); \ + v16 = vnsrl_wx_u16m2(vabcd, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, @@ -1586,17 +1952,17 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin vuint64m8_t v64 = vundefined_u64m8(); \ vuint32m4_t v32 = vundefined_u32m4(); \ vuint16m2_t v16 = vundefined_u16m2(); \ - v64 = vset_u64m8(v64, 0, a.val); \ - v64 = vset_u64m8(v64, 1, b.val); \ - v64 = vset_u64m8(v64, 2, c.val); \ - v64 = vset_u64m8(v64, 3, d.val); \ - v64 = vset_u64m8(v64, 4, e.val); \ - v64 = vset_u64m8(v64, 5, f.val); \ - v64 = vset_u64m8(v64, 6, g.val); \ - v64 = vset_u64m8(v64, 7, h.val); \ - v32 = vnsrl_vx_u32m4(v64, 0, 16); - v16 = vnsrl_vx_u16m2(v32, 0, 16); - return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16)); + v64 = vset_v_u64m1_u64m8(v64, 0, a.val); \ + v64 = vset_v_u64m1_u64m8(v64, 1, b.val); \ + v64 = vset_v_u64m1_u64m8(v64, 2, c.val); \ + v64 = vset_v_u64m1_u64m8(v64, 3, d.val); \ + v64 = vset_v_u64m1_u64m8(v64, 4, e.val); \ + v64 = vset_v_u64m1_u64m8(v64, 5, f.val); \ + v64 = vset_v_u64m1_u64m8(v64, 6, g.val); \ + v64 = vset_v_u64m1_u64m8(v64, 7, h.val); \ + v32 = vnsrl_wx_u32m4(v64, 0, 16); + v16 = vnsrl_wx_u16m2(v32, 0, 16); + return v_uint8x16(vnsrl_wx_u8m1(v16, 0, 16)); } //inline v_uint8x16 v_pack_u(const v_int16x8& a, 
const v_int16x8& b) \ @@ -1612,63 +1978,54 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1)); \ } \ inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \ + return vse##tp1##_v_u##tp1##m1(ptr, vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), 0, num1), num2); \ } \ template inline \ v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ - tmp = vset_##i##tp2##m2(tmp, 1, b.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 1, b.val); \ vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \ + return v_uint##tp1##x##num1(vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val), n, num1)); \ } \ template inline \ void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \ { \ vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \ - tmp = vset_##i##tp2##m2(tmp, 0, a.val); \ + tmp = vset_v_##i##tp2##m1_##i##tp2##m2(tmp, 0, a.val); \ vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\ - vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \ - return vse_v_u##tp1##m1(ptr, val, num2);\ + vuint##tp1##m1_t val = vnclipu_wx_u##tp1##m1(vreinterpret_v_i##tp2##m2_u##tp2##m2(val_), n, num1); \ + return vse##tp1##_v_u##tp1##m1(ptr, val, num2);\ } OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char ) OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short) -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif // saturating multiply 8-bit, 16-bit -#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ - { \ - _Tpwvec c, d; \ - v_mul_expand(a, b, c, d); \ - return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } - -OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8) -OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8) -OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4) -OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4) - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif +#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, num, mul, cvt) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ + { \ + auto res = mul(a.val, b.val, num); \ + return _Tpvec(cvt(res, 0, num)); \ + } + +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, 
16, vwmul_vv_i16m2, vnclip_wx_i8m1) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, 16, vwmulu_vv_u16m2, vnclipu_wx_u8m1) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, 32, vwmul_vv_i32m2, vnclip_wx_i16m1) +OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, 32, vwmulu_vv_u32m2, vnclipu_wx_u16m1) + + static const signed char popCountTable[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, @@ -1690,8 +2047,12 @@ static const signed char popCountTable[256] = }; inline vuint8m1_t vcnt_u8(vuint8m1_t val){ - vuint8m1_t v0 = val & 1; - return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0; +#if __riscv_v == 7000 + vuint8m1_t v0 = vand_vx_u8m1(val, 1, 16); + return vadd_vv_u8m1(vloxei8_v_u8m1((unsigned char*)popCountTable, vsrl_vx_u8m1(val, 1, 16), 16), v0, 16); +#else + return vloxei8_v_u8m1((unsigned char*)popCountTable, val, 16); +#endif } inline v_uint8x16 @@ -1703,156 +2064,138 @@ v_popcount(const v_uint8x16& a) inline v_uint8x16 v_popcount(const v_int8x16& a) { - return v_uint8x16(vcnt_u8((vuint8m1_t)a.val)); + return v_uint8x16(vcnt_u8(vreinterpret_v_i8m1_u8m1(a.val))); } inline v_uint16x8 v_popcount(const v_uint16x8& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u16m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint16x8 v_popcount(const v_int16x8& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8); - return v_uint16x8(vget_u16m2_u16m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + return v_uint16x8(vget_v_u16m2_u16m1(vwaddu_vv_u16m2(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8), 0)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u32m1_u8m1(a.val)); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint32x4 
v_popcount(const v_int32x4& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501, - 0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16); - vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8); - return v_uint32x4(vget_u32m2_u32m1(res, 0)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(a.val))); + vuint8m1_t seq = vid_v_u8m1(8); + vuint8m1_t index = vsll_vx_u8m1(seq, 1, 8); + vuint8m1_t sum = vadd_vv_u8m1(vrgather_vv_u8m1(tmp, index, 8), vrgather_vv_u8m1(tmp, vadd_vx_u8m1(index, 1, 8), 8), 8); + return v_uint32x4(vget_v_u32m4_u32m1(vwaddu_vx_u32m4(vwaddu_vv_u16m2(vrgather_vv_u8m1(sum, index, 4), vrgather_vv_u8m1(sum, vadd_vx_u8m1(index, 1, 4), 4), 4), 0, 4), 0)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_u64m1_u8m1(a.val)); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(); + vuint16m1_t res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } inline v_uint64x2 v_popcount(const v_int64x2& a) { - vuint8m2_t tmp = vundefined_u8m2(); - tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val)); - vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000, - 0x0F0E0D0C0B0A0908, 0x0000000000000000}; - tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32); \ - vuint8m1_t zero = vmv_v_x_u8m1(0, 16); - vuint8m1_t res1 = zero; - vuint8m1_t res2 = zero; - res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8); - res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8); - - return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8)); + vuint8m1_t tmp = vcnt_u8(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i64m1_i8m1(a.val))); + vuint16m2_t tmp16 = vwaddu_vx_u16m2(tmp, 0, 16); + vuint16m1_t res1 = vundefined_u16m1(), res2 = vundefined_u16m1(); + res1 = vredsum_vs_u16m1_u16m1(res1, vget_v_u16m2_u16m1(tmp16, 0), vmv_v_x_u16m1(0, 8), 8); + res2 = vredsum_vs_u16m1_u16m1(res2, vget_v_u16m2_u16m1(tmp16, 1), vmv_v_x_u16m1(0, 8), 8); + return v_uint64x2((unsigned long)vmv_x_s_u16m1_u16(res1), (unsigned long)vmv_x_s_u16m1_u16(res2)); } #define SMASK 1, 2, 4, 8, 16, 32, 64, 128 inline int v_signmask(const v_uint8x16& a) { + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); vuint8m1_t t0 = 
vsrl_vx_u8m1(a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16); - vuint32m1_t res = vmv_v_x_u32m1(0, 4); - vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8); - res = vredsum_vs_u32m2_u32m1(res, t2, res, 8); - res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8); - return vmv_x_s_u32m1_u32(res, 8); + vbool8_t mask = vmseq_vx_u8m1_b8(t0, 1, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int8x16& a) { - vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16); - vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK}; - vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16); - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8); - res = vredsum_vs_i32m2_i32m1(res, t2, res, 8); - res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8); - return vmv_x_s_i32m1_i32(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint8m1_t id = vid_v_u8m1(16); + vuint16m2_t num = vsll_vv_u16m2(vmv_v_x_u16m2(1, 16), vwaddu_vx_u16m2(id, 0, 16), 16); + vbool8_t mask = vmslt_vx_i8m1_b8(a.val, 0, 16); + res = vredsum_vs_u16m2_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vbool16_t mask = vmslt_vx_i16m1_b16(a.val, 0, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 16); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_uint16x8& a) { - vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8); - vint16m1_t m1 = (vint16m1_t){SMASK}; - vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8); - vint16m1_t res = vmv_v_x_i16m1(0, 8); - res = vredsum_vs_i16m1_i16m1(res, t1, res, 8); - return vmv_x_s_i16m1_i16(res, 8); + vuint16m1_t res = vundefined_u16m1(); + vuint16m1_t id = vid_v_u16m1(8); + vuint16m1_t num = vsll_vv_u16m1(vmv_v_x_u16m1(1, 8), id, 8); + vuint16m1_t t0 = vsrl_vx_u16m1(a.val, 15, 8); + vbool16_t mask = vmseq_vx_u16m1_b16(t0, 1, 8); + res = vredsum_vs_u16m1_u16m1_m(mask, res, num, vmv_v_x_u16m1(0, 8), 8); + return vmv_x_s_u16m1_u16(res); } inline int v_signmask(const v_int32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmslt_vx_i32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return 
vmv_x_s_i32m1_i32(res, 4); + vuint32m1_t res = vundefined_u32m1(); + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vuint32m1_t t0 = vsrl_vx_u32m1(a.val, 31, 4); + vbool32_t mask = vmseq_vx_u32m1_b32(t0, 1, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res); } inline int v_signmask(const v_uint64x2& a) { - vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2); - int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1); - return res; + vuint64m1_t res = vundefined_u64m1(); + vuint64m1_t id = vid_v_u64m1(2); + vuint64m1_t num = vsll_vv_u64m1(vmv_v_x_u64m1(1, 2), id, 2); + vuint64m1_t t0 = vsrl_vx_u64m1(a.val, 63, 2); + vbool64_t mask = vmseq_vx_u64m1_b64(t0, 1, 2); + res = vredsum_vs_u64m1_u64m1_m(mask, res, num, vmv_v_x_u64m1(0, 2), 2); + return vmv_x_s_u64m1_u64(res); } inline int v_signmask(const v_int64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } @@ -1860,12 +2203,14 @@ inline int v_signmask(const v_float64x2& a) { return v_signmask(v_reinterpret_as_u64(a)); } inline int v_signmask(const v_float32x4& a) { - vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4); - vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8}; - vint32m1_t res = vmv_v_x_i32m1(0, 4); - vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4); - res = vredsum_vs_i32m1_i32m1(res, t1, res, 4); - return vmv_x_s_i32m1_i32(res, 4); + return v_signmask(v_reinterpret_as_u32(a)); + /* + vuint32m1_t res; + vuint32m1_t id = vid_v_u32m1(4); + vuint32m1_t num = vsll_vv_u32m1(vmv_v_x_u32m1(1, 4), id, 4); + vbool32_t mask = vmflt_vf_f32m1_b32(a.val, 0, 4); + res = vredsum_vs_u32m1_u32m1_m(mask, res, num, vmv_v_x_u32m1(0, 4), 4); + return vmv_x_s_u32m1_u32(res);*/ } inline int v_scan_forward(const v_int8x16& a) { @@ -1905,24 +2250,22 @@ int val = v_signmask(a); if(val==0) return 0; else return trailingZeros32(val); } -#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \ +#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num, mask_b) \ inline bool v_check_all(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \ - vuint64m1_t v1 = vuint64m1_t(v0); \ - return (v1[0] | v1[1]) == 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) == 0; \ } \ inline bool v_check_any(const v_##_Tpvec& a) \ { \ suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \ - vuint64m1_t v1 = vuint64m1_t(v0); \ - return (v1[0] | v1[1]) != 0; \ + return (vcpop_m_##mask_b(vmseq_vx_##_T##_##mask_b(v0, 1, num), num)) != 0; \ } -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4) -OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16, b8) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8, b16) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4, b32) +OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2, b64) inline bool v_check_all(const v_int8x16& a) { return v_check_all(v_reinterpret_as_u8(a)); } @@ -1950,97 +2293,93 @@ inline bool v_check_any(const v_int64x2& a) inline bool v_check_any(const v_float64x2& a) { return v_check_any(v_reinterpret_as_u64(a)); } -#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \ +#define 
OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num, mask_func) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ { \ - return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \ + return _Tpvec(vmerge_vvm_##suffix(mask_func(mask.val, 0, num), b.val, a.val, num)); \ } -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8) -OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16, vmsne_vx_i8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8, vmsne_vx_i16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4, vmsne_vx_i32m1_b32) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16, vmsne_vx_u8m1_b8) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8, vmsne_vx_u16m1_b16) +OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4, vmsne_vx_u32m1_b32) inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b) { - return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4)); + return v_float32x4(vmerge_vvm_f32m1(vmfne_vf_f32m1_b32(mask.val, 0, 4), b.val, a.val, 4)); } inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b) { - return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2)); + return v_float64x2(vmerge_vvm_f64m1(vmfne_vf_f64m1_b64(mask.val, 0, 2), b.val, a.val, 2)); } -#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \ +#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2, num3) \ inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \ - b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + b0.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 0); \ + b1.val = vget_v_##_Tp2##m2_##_Tp2##m1(b, 1); \ } \ inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } \ inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \ { \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(a.val, 0, num1); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 1)); \ } \ inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \ { \ - _T2##_t val = vle##_v_##_Tp1(ptr, num2); \ - _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \ - return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \ + _T2##_t val = vle##num3##_v_##_Tp1(ptr, num2); \ + _T1##_t b = vw##add##_vx_##_Tp2##m2(val, 0, num2); \ + return v_##_Tpwvec(vget_v_##_Tp2##m2_##_Tp2##m1(b, 0)); \ } -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 
8, vuint16m2, vuint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1) -OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1, 32) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1, 8) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1, 16) +OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1, 32) inline v_uint32x4 v_load_expand_q(const uchar* ptr) { vuint16m2_t b = vundefined_u16m2(); vuint32m2_t c = vundefined_u32m2(); - vuint8m1_t val = vle_v_u8m1(ptr, 4); \ + vuint8m1_t val = vle8_v_u8m1(ptr, 4); \ b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4); \ - c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ - return v_uint32x4(vget_u32m2_u32m1(c, 0)); + c = vwaddu_vv_u32m2(vget_v_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4); \ + return v_uint32x4(vget_v_u32m2_u32m1(c, 0)); } inline v_int32x4 v_load_expand_q(const schar* ptr) { vint16m2_t b = vundefined_i16m2(); vint32m2_t c = vundefined_i32m2(); - vint8m1_t val = vle_v_i8m1(ptr, 4); \ + vint8m1_t val = vle8_v_i8m1(ptr, 4); \ b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4); \ - c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ - return v_int32x4(vget_i32m2_i32m1(c, 0)); -} -#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C} -#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006} -#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003} -#define VITL_2 (vuint64m2_t){0, 2, 1, 3} -#define LOW_4 0x0000000100000000, 0x0000000500000004 -#define LOW_8 0x0003000200010000, 0x000B000A00090008 -#define LOW_16 0x0706050403020100, 0x1716151413121110 -#define HIGH_4 0x0000000300000002, 0x0000000700000006 -#define HIGH_8 0x0007000600050004, 0x000F000E000D000C -#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918 -#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \ + c = vwadd_vv_i32m2(vget_v_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4); \ + return v_int32x4(vget_v_i32m2_i32m1(c, 0)); +} +#define VITL_16 {0x11011000, 0x13031202, 0x15051404, 0x17071606, 0x19091808, 0x1B0B1A0A, 0x1D0D1C0C, 0x1F0F1E0E} +#define VITL_8 {0x00080000, 0x00090001, 0x000A0002, 0x000B0003, 0x000C0004, 0x000D0005, 0x000E0006, 0x000F0007} +#define VITL_4 {0x00000000, 0x00000004, 0x00000001, 0x00000005, 0x00000002, 0x00000006, 0x00000003, 0x00000007} +#define VITL_2 {0, 0, 2, 0, 1, 0, 3, 0} + +#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh, refunc) \ inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \ { \ 
v##_Tp##m2_t tmp = vundefined_##_T##m2();\ - tmp = vset_##_T##m2(tmp, 0, a0.val); \ - tmp = vset_##_T##m2(tmp, 1, a1.val); \ - vuint64m2_t mask = VITL_##num; \ - tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \ - b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \ - b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 0, a0.val); \ + tmp = vset_v_##_T##m1_##_T##m2(tmp, 1, a1.val); \ + unsigned mdata[] = VITL_##num; \ + vuint32m2_t mask = vle32_v_u32m2(mdata, 8); \ + tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, refunc(mask), num2); \ + b0.val = vget_v_##_T##m2_##_T##m1(tmp, 0); \ + b1.val = vget_v_##_T##m2_##_T##m1(tmp, 1); \ } \ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ @@ -2049,58 +2388,59 @@ inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \ } \ inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \ { \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ - v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t b1 = vundefined_##_T##m1(); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ + b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ return v_##_Tpvec(b1);\ } \ inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \ { \ + v##_Tp##m1_t b0 = vundefined_##_T##m1(); \ + v##_Tp##m1_t a0 = vundefined_##_T##m1(); \ c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \ - v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \ - v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \ + b0 = vslidedown_vx_##_T##m1(b0, b.val, numh, num); \ + a0 = vslidedown_vx_##_T##m1(a0, a.val, numh, num); \ d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \ } -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2) -OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8, vreinterpret_v_u32m2_u8m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4, vreinterpret_v_u32m2_u16m2) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2,) +OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1, 
vreinterpret_v_u32m2_u64m2) inline v_uint8x16 v_reverse(const v_uint8x16 &a) { - vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16)); + return v_uint8x16(vrgather_vv_u8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_int8x16 v_reverse(const v_int8x16 &a) { - vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607}; - return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16)); + return v_int8x16(vrgather_vv_i8m1(a.val, vrsub_vx_u8m1(vid_v_u8m1(16), 15, 16), 16)); } inline v_uint16x8 v_reverse(const v_uint16x8 &a) { - vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8)); + return v_uint16x8(vrgather_vv_u16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_int16x8 v_reverse(const v_int16x8 &a) { - vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x000000100020003}; - return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8)); + return v_int16x8(vrgather_vv_i16m1(a.val, vrsub_vx_u16m1(vid_v_u16m1(8), 7, 8), 8)); } inline v_uint32x4 v_reverse(const v_uint32x4 &a) { - return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_uint32x4(vrgather_vv_u32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_int32x4 v_reverse(const v_int32x4 &a) { - return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4)); + return v_int32x4(vrgather_vv_i32m1(a.val, vrsub_vx_u32m1(vid_v_u32m1(4), 3, 4), 4)); } inline v_float32x4 v_reverse(const v_float32x4 &a) @@ -2108,17 +2448,17 @@ inline v_float32x4 v_reverse(const v_float32x4 &a) inline v_uint64x2 v_reverse(const v_uint64x2 &a) { - return v_uint64x2(a.val[1], a.val[0]); + return v_uint64x2(vrgather_vv_u64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_int64x2 v_reverse(const v_int64x2 &a) { - return v_int64x2(a.val[1], a.val[0]); + return v_int64x2(vrgather_vv_i64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } inline v_float64x2 v_reverse(const v_float64x2 &a) { - return v_float64x2(a.val[1], a.val[0]); + return v_float64x2(vrgather_vv_f64m1(a.val, vrsub_vx_u64m1(vid_v_u64m1(2), 1, 2), 2)); } #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \ @@ -2137,19 +2477,19 @@ OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2) OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3) -#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \ -template inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; } +#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix, vtype, _vtype, num, mvfunc) \ +template inline _Tp v_extract_n(_Tpvec v) { vtype tmp = vundefined_##_vtype(); return mvfunc(vslidedown_vx_##_vtype(tmp, v.val, i, num)); } -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32) -OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8, vuint8m1_t, u8m1, 16, vmv_x_s_u8m1_u8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8, vint8m1_t, 
i8m1, 16, vmv_x_s_i8m1_i8) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16, vuint16m1_t, u16m1, 8, vmv_x_s_u16m1_u16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16, vint16m1_t, i16m1, 8, vmv_x_s_i16m1_i16) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32, vuint32m1_t, u32m1, 4, vmv_x_s_u32m1_u32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32, vint32m1_t, i32m1, 4, vmv_x_s_i32m1_i32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64, vuint64m1_t, u64m1, 2, vmv_x_s_u64m1_u64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64, vint64m1_t, i64m1, 2, vmv_x_s_i64m1_i64) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32, vfloat32m1_t, f32m1, 4, vfmv_f_s_f32m1_f32) +OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64, vfloat64m1_t, f64m1, 2, vfmv_f_s_f64m1_f64) #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \ template inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); } @@ -2163,10 +2503,24 @@ OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2) OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4) + +inline void __builtin_riscv_fsrm(int val) +{ + asm("csrw frm, %0\n\t" + : + :"r"(val)); + return; +} + +inline void barrier1(void *arg) { + __asm__ __volatile__("" : : "r" (arg) : "memory"); +} + inline v_int32x4 v_round(const v_float32x4& a) { __builtin_riscv_fsrm(0); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2175,7 +2529,8 @@ inline v_int32x4 v_round(const v_float32x4& a) inline v_int32x4 v_floor(const v_float32x4& a) { __builtin_riscv_fsrm(2); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2185,7 +2540,8 @@ inline v_int32x4 v_floor(const v_float32x4& a) inline v_int32x4 v_ceil(const v_float32x4& a) { __builtin_riscv_fsrm(3); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2195,7 +2551,8 @@ inline v_int32x4 v_ceil(const v_float32x4& a) inline v_int32x4 v_trunc(const v_float32x4& a) { __builtin_riscv_fsrm(1); - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(a.val), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4); __builtin_riscv_fsrm(0); @@ -2206,10 +2563,11 @@ inline v_int32x4 v_round(const v_float64x2& a) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); //_val = vset_f64m2(_val, 1, a.val); - _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); - 
vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2)); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2217,9 +2575,10 @@ inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) { __builtin_riscv_fsrm(0); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + barrier1(&_val); + vint32m1_t val = vfncvt_x_f_w_i32m1(_val, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } @@ -2227,10 +2586,10 @@ inline v_int32x4 v_floor(const v_float64x2& a) { __builtin_riscv_fsrm(2); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2241,10 +2600,10 @@ inline v_int32x4 v_ceil(const v_float64x2& a) { __builtin_riscv_fsrm(3); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); @@ -2255,139 +2614,86 @@ inline v_int32x4 v_trunc(const v_float64x2& a) { __builtin_riscv_fsrm(1); vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); - - vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); + vint32m1_t nan = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(aval), 0x7f800000, 4); + barrier1(&nan); vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4); vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4); __builtin_riscv_fsrm(0); return v_int32x4(val); } -#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ { \ - v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ + intrin##2e##elemsize##_v_##_T##m1(&a.val, &b.val, ptr, num); \ } \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ { \ - v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ + intrin##3e##elemsize##_v_##_T##m1(&a.val, &b.val, 
&c.val, ptr, num); \ }\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ { \ - v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ + intrin##4e##elemsize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num); \ } \ -#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T, elemsize) \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - intrin##2e_v_##_T##m1x2(ptr, ret, num); \ + intrin##2e##elemsize##_v_##_T##m1(ptr, a.val, b.val, num); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ { \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - intrin##3e_v_##_T##m1x3(ptr, ret, num); \ + intrin##3e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, num); \ } \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ { \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - intrin##4e_v_##_T##m1x4(ptr, ret, num); \ + intrin##4e##elemsize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num); \ } -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \ -OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \ -OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T) +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T, elemsize) \ +OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T, elemsize) //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, ) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32, 32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8, 8) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16, 16) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, 
u32, 32) -#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \ +#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T, _esize) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \ -{ \ - v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \ - a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \ -} \ +{ vlseg2e##_esize##_v_##_T##m1(&a.val, &b.val, ptr, num);} \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \ -{ \ - v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \ - a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \ -}\ +{ vlseg3e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, ptr, num);}\ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \ v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \ -{ \ - v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \ - a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \ - b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \ - c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \ - d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \ -} \ +{ vlseg4e##_esize##_v_##_T##m1(&a.val, &b.val, &c.val, &d.val, ptr, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \ - ret = vset_##_T##m1x2(ret, 0, a.val); \ - ret = vset_##_T##m1x2(ret, 1, b.val); \ - vsseg2e_v_##_T##m1x2(ptr, ret, num); \ -} \ +{ vsseg2e##_esize##_v_##_T##m1(ptr, a.val, b.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ -{ \ - v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \ - ret = vset_##_T##m1x3(ret, 0, a.val); \ - ret = vset_##_T##m1x3(ret, 1, b.val); \ - ret = vset_##_T##m1x3(ret, 2, c.val); \ - vsseg3e_v_##_T##m1x3(ptr, ret, num); \ -} \ +{ vsseg3e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, num);} \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \ const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \ hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ -{ \ - v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \ - ret = vset_##_T##m1x4(ret, 0, a.val); \ - ret = vset_##_T##m1x4(ret, 1, b.val); \ - ret = vset_##_T##m1x4(ret, 2, c.val); \ - ret = vset_##_T##m1x4(ret, 3, d.val); \ - vsseg4e_v_##_T##m1x4(ptr, ret, num); \ -} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64) +{ vsseg4e##_esize##_v_##_T##m1(ptr, a.val, b.val, c.val, d.val, num);} -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64) -OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32, 32) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64, 64) + +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64, 64) +OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64, 64) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { @@ -2398,17 +2704,17 @@ inline v_float32x4 v_cvt_f32(const v_int32x4& a) inline v_float32x4 v_cvt_f32(const v_float64x2& a) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - 
vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 2); return v_float32x4(aval); } inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) { vfloat64m2_t _val = vundefined_f64m2(); - _val = vset_f64m2(_val, 0, a.val); - _val = vset_f64m2(_val, 1, b.val); - vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4); + _val = vset_v_f64m1_f64m2(_val, 0, a.val); + _val = vset_v_f64m1_f64m2(_val, 1, b.val); + vfloat32m1_t aval = vfncvt_f_f_w_f32m1(_val, 4); return v_float32x4(aval); } @@ -2416,26 +2722,26 @@ inline v_float64x2 v_cvt_f64(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) { vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4); vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 0)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 0)); } inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) { vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4); - return v_float64x2(vget_f64m2_f64m1(_val, 1)); + return v_float64x2(vget_v_f64m2_f64m1(_val, 1)); } inline v_float64x2 v_cvt_f64(const v_int64x2& a) @@ -2446,8 +2752,9 @@ inline v_float64x2 v_cvt_f64(const v_int64x2& a) #endif inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08}; - return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0705060403010200, 0x0F0D0E0C0B090A08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { @@ -2456,8 +2763,9 @@ inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) inline v_int8x16 v_interleave_quads(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08}; - return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0703060205010400, 0x0F0B0E0A0D090C08}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vrgather_vv_i8m1(vec.val, vreinterpret_v_u64m1_u8m1(m0), 16)); } inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { @@ -2466,35 +2774,40 @@ inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0706030205040100, 0x0F0E0B0A0D0C0908}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } inline v_int16x8 v_interleave_quads(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 
16)); + uint64 mdata[2] = {0x0B0A030209080100, 0x0F0E07060D0C0504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) { - vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504}; - return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0B0A090803020100, 0x0F0E0D0C07060504}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int32x4(vreinterpret_v_i8m1_i32m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i32m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_int8x16 v_pack_triplets(const v_int8x16& vec) { - vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; - return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int8x16(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vec.val), vreinterpret_v_u64m1_u8m1(m0), 16))); } inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } inline v_int16x8 v_pack_triplets(const v_int16x8& vec) { - vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; - return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16)); + uint64 mdata[2] = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A}; + vuint64m1_t m0 = vle64_v_u64m1(mdata, 2); + return v_int16x8(vreinterpret_v_i8m1_i16m1(vreinterpret_v_u8m1_i8m1(vrgather_vv_u8m1(vreinterpret_v_i8m1_u8m1(vreinterpret_v_i16m1_i8m1(vec.val)), vreinterpret_v_u64m1_u8m1(m0), 16)))); } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } @@ -2507,36 +2820,66 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) { vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4); - vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2); + vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_v_i64m2_i64m1(v1, 0), vget_v_i64m2_i64m1(v1, 1), 2), 2); return v_float64x2(res); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { v_float64x2 res = v_dotprod_expand_fast(a, b); - return res + c; } + return v_add(res, c); } #endif ////// FP16 support /////// -inline v_float32x4 v_load_expand(const float16_t* ptr) +#if __riscv_v == 7000 +inline v_float32x4 v_load_expand(const hfloat* ptr) { - vfloat16m1_t v = 
vle_v_f16m1((__fp16*)ptr, 4); + vfloat16m1_t v = vle16_v_f16m1((__fp16*)ptr, 4); vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4); - return v_float32x4(vget_f32m2_f32m1(v32, 0)); + return v_float32x4(vget_v_f32m2_f32m1(v32, 0)); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { vfloat32m2_t v32 = vundefined_f32m2(); - v32 = vset_f32m2(v32, 0, v.val); - vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4); - vse_v_f16m1((__fp16*)ptr, hv, 4); + v32 = vset_v_f32m1_f32m2(v32, 0, v.val); + vfloat16m1_t hv = vfncvt_f_f_w_f16m1(v32, 4); + vse16_v_f16m1((__fp16*)ptr, hv, 4); +} +#else +inline v_float32x4 v_load_expand(const hfloat* ptr) +{ + vfloat16mf2_t v = vle16_v_f16mf2((__fp16*)ptr, 4); + vfloat32m1_t v32 = vfwcvt_f_f_v_f32m1(v, 4); + return v_float32x4(v32); } +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) +{ + //vfloat32m2_t v32 = vundefined_f32m2(); + //v32 = vset_f32m2(v32, 0, v.val); + vfloat16mf2_t hv = vfncvt_f_f_w_f16mf2(v.val, 4); + vse16_v_f16mf2((__fp16*)ptr, hv, 4); +} +#endif inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/intrin_rvv_scalable.hpp b/3rdParty/opencv2/core/hal/intrin_rvv_scalable.hpp new file mode 100644 index 0000000000..b6ce2d7f47 --- /dev/null +++ b/3rdParty/opencv2/core/hal/intrin_rvv_scalable.hpp @@ -0,0 +1,2153 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// The original implementation is contributed by HAN Liutong. +// Copyright (C) 2022, Institute of Software, Chinese Academy of Sciences. + +#ifndef OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP +#define OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP + +#include + +#if defined(__GNUC__) && !defined(__clang__) +// FIXIT: eliminate massive warnigs from templates +// GCC from 'rvv-next': riscv64-unknown-linux-gnu-g++ (g42df3464463) 12.0.1 20220505 (prerelease) +// doesn't work: #pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wignored-attributes" +#endif + +#ifndef CV_RVV_MAX_VLEN +#define CV_RVV_MAX_VLEN 1024 +#endif + +namespace cv +{ + +//! 
@cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+#define CV_SIMD_SCALABLE 1
+#define CV_SIMD_SCALABLE_64F 1
+
+using v_uint8 = vuint8m2_t;
+using v_int8 = vint8m2_t;
+using v_uint16 = vuint16m2_t;
+using v_int16 = vint16m2_t;
+using v_uint32 = vuint32m2_t;
+using v_int32 = vint32m2_t;
+using v_uint64 = vuint64m2_t;
+using v_int64 = vint64m2_t;
+
+using v_float32 = vfloat32m2_t;
+#if CV_SIMD_SCALABLE_64F
+using v_float64 = vfloat64m2_t;
+#endif
+
+using uchar = unsigned char;
+using schar = signed char;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint64 = unsigned long int;
+using int64 = long int;
+
+
+template <class T>
+struct VTraits;
+
+#define OPENCV_HAL_IMPL_RVV_TRAITS(REG, TYP, SUF, SZ) \
+template <> \
+struct VTraits<REG> \
+{ \
+ static inline int vlanes() { return __riscv_vsetvlmax_##SUF(); } \
+ using lane_type = TYP; \
+ static const int max_nlanes = CV_RVV_MAX_VLEN/SZ; \
+};
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m1_t, int8_t, e8m1, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m2_t, int8_t, e8m2, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m4_t, int8_t, e8m4, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint8m8_t, int8_t, e8m8, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m1_t, uint8_t, e8m1, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m2_t, uint8_t, e8m2, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m4_t, uint8_t, e8m4, 8)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint8m8_t, uint8_t, e8m8, 8)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m1_t, int16_t, e16m1, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m2_t, int16_t, e16m2, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m4_t, int16_t, e16m4, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint16m8_t, int16_t, e16m8, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m1_t, uint16_t, e16m1, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m2_t, uint16_t, e16m2, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m4_t, uint16_t, e16m4, 16)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint16m8_t, uint16_t, e16m8, 16)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m1_t, int32_t, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m2_t, int32_t, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m4_t, int32_t, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint32m8_t, int32_t, e32m8, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m1_t, uint32_t, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m2_t, uint32_t, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m4_t, uint32_t, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint32m8_t, uint32_t, e32m8, 32)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m1_t, int64_t, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m2_t, int64_t, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m4_t, int64_t, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vint64m8_t, int64_t, e64m8, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m1_t, uint64_t, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m2_t, uint64_t, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m4_t, uint64_t, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vuint64m8_t, uint64_t, e64m8, 64)
+
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m1_t, float, e32m1, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m2_t, float, e32m2, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m4_t, float, e32m4, 32)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat32m8_t, float, e32m8, 32)
+
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m1_t, double, e64m1, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m2_t, double, e64m2, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m4_t, double, e64m4, 64)
+OPENCV_HAL_IMPL_RVV_TRAITS(vfloat64m8_t, double, e64m8, 64)
+#endif
+
+
+// LLVM/Clang defines "overloaded intrinsics" e.g. 'vand(op1, op2)'
+// GCC does not have these functions, so we need to implement them manually
+// We implement only selected subset required to build current state of the code
+// Included inside namespace cv::
+// #ifndef __riscv_v_intrinsic_overloading
+// #include "intrin_rvv_compat_overloaded.hpp"
+// #endif // __riscv_v_intrinsic_overloading
+
+
+//////////// get0 ////////////
+#define OPENCV_HAL_IMPL_RVV_GRT0_INT(_Tpvec, _Tp) \
+inline _Tp v_get0(const v_##_Tpvec& v) \
+{ \
+ return __riscv_vmv_x(v); \
+}
+
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint8, uchar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int8, schar)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint16, ushort)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int16, short)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint32, unsigned)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int32, int)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(uint64, uint64)
+OPENCV_HAL_IMPL_RVV_GRT0_INT(int64, int64)
+
+inline float v_get0(const v_float32& v) \
+{ \
+ return __riscv_vfmv_f(v); \
+}
+#if CV_SIMD_SCALABLE_64F
+inline double v_get0(const v_float64& v) \
+{ \
+ return __riscv_vfmv_f(v); \
+}
+#endif
+
+//////////// Initial ////////////
+
+#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, suffix1, suffix2, vl) \
+inline v_##_Tpvec v_setzero_##suffix1() \
+{ \
+ return __riscv_vmv_v_x_##suffix2##m2(0, vl); \
+} \
+inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
+{ \
+ return __riscv_vmv_v_x_##suffix2##m2(v, vl); \
+} \
+template <> inline v_##_Tpvec v_setzero_<v_##_Tpvec>() \
+{ \
+ return v_setzero_##suffix1(); \
+} \
+template <> inline v_##_Tpvec v_setall_<v_##_Tpvec>(_Tp v) \
+{ \
+ return v_setall_##suffix1(v); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8, uchar, u8, u8, VTraits<v_uint8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8, schar, s8, i8, VTraits<v_int8>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16, ushort, u16, u16, VTraits<v_uint16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16, short, s16, i16, VTraits<v_int16>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32, uint, u32, u32, VTraits<v_uint32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32, int, s32, i32, VTraits<v_int32>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64, uint64, u64, u64, VTraits<v_uint64>::vlanes())
+OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64, int64, s64, i64, VTraits<v_int64>::vlanes())
+
+#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, suffix, vl) \
+inline v_##_Tpv v_setzero_##suffix() \
+{ \
+ return __riscv_vfmv_v_f_##suffix##m2(0, vl); \
+} \
+inline v_##_Tpv v_setall_##suffix(_Tp v) \
+{ \
+ return __riscv_vfmv_v_f_##suffix##m2(v, vl); \
+} \
+template <> inline v_##_Tpv v_setzero_<v_##_Tpv>() \
+{ \
+ return v_setzero_##suffix(); \
+} \
+template <> inline v_##_Tpv v_setall_<v_##_Tpv>(_Tp v) \
+{ \
+ return v_setall_##suffix(v); \
+}
+
+OPENCV_HAL_IMPL_RVV_INIT_FP(float32, float, f32, VTraits<v_float32>::vlanes())
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_INIT_FP(float64, double, f64, VTraits<v_float64>::vlanes())
+#endif
+
+//////////// Reinterpret ////////////
+#define OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(_Tpvec1, suffix1) \
+inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec1& v) \
+{ \
+ return v;\
+}
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint8, u8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint16, u16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint32, u32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(uint64, u64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int8, s8)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int16, s16)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int32, s32)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(int64, s64)
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float32, f32)
+#if CV_SIMD_SCALABLE_64F
+OPENCV_HAL_IMPL_RVV_NOTHING_REINTERPRET(float64, f64) +#endif +// TODO: can be simplified by using overloaded RV intrinsic +#define OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + return v_##_Tpvec1(__riscv_vreinterpret_v_##nsuffix2##m2_##nsuffix1##m2(v));\ +} \ +inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + return v_##_Tpvec2(__riscv_vreinterpret_v_##nsuffix1##m2_##nsuffix2##m2(v));\ +} + +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, int8, u8, s8, u8, i8) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, int16, u16, s16, u16, i16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, int32, u32, s32, u32, i32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, float32, u32, f32, u32, f32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, float32, s32, f32, i32, f32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, int64, u64, s64, u64, i64) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint64, float64, u64, f64, u64, f64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int64, float64, s64, f64, i64, f64) +#endif +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint16, u8, u16, u8, u16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint32, u8, u32, u8, u32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint8, uint64, u8, u64, u8, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint32, u16, u32, u16, u32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint16, uint64, u16, u64, u16, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(uint32, uint64, u32, u64, u32, u64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int16, s8, s16, i8, i16) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int32, s8, s32, i8, i32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int8, int64, s8, s64, i8, i64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int32, s16, s32, i16, i32) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int16, int64, s16, s64, i16, i64) +OPENCV_HAL_IMPL_RVV_NATIVE_REINTERPRET(int32, int64, s32, s64, i32, i64) + + +#define OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(_Tpvec1, _Tpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \ +inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \ +{ \ + return __riscv_vreinterpret_v_##nsuffix1##width2##m2_##nsuffix1##width1##m2(__riscv_vreinterpret_v_##nsuffix2##width2##m2_##nsuffix1##width2##m2(v));\ +} \ +inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \ +{ \ + return __riscv_vreinterpret_v_##nsuffix1##width2##m2_##nsuffix2##width2##m2(__riscv_vreinterpret_v_##nsuffix1##width1##m2_##nsuffix1##width2##m2(v));\ +} + +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int16, u8, s16, u, i, 8, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int32, u8, s32, u, i, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, int64, u8, s64, u, i, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int8, u16, s8, u, i, 16, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int32, u16, s32, u, i, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, int64, u16, s64, u, i, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int8, u32, s8, u, i, 32, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int16, u32, s16, u, i, 32, 16) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, int64, u32, s64, u, i, 32, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int8, u64, s8, u, i, 64, 8) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int16, u64, s16, u, i, 64, 16) 
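+// NOTE (illustrative sketch, not part of the upstream header): the v_reinterpret_as_*
+// helpers generated above only retype the register bits; no value conversion happens.
+// The function name below is hypothetical and the block is disabled on purpose.
+#if 0
+inline void example_reinterpret_usage()
+{
+    v_float32 vf   = v_setall_f32(1.5f);
+    v_int32   bits = v_reinterpret_as_s32(vf);   // same register bits, viewed as int32 lanes
+    v_float32 back = v_reinterpret_as_f32(bits); // bit-for-bit round trip back to float32
+    (void)back;
+}
+#endif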
+OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, int32, u64, s32, u, i, 64, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float32, u8, f32, u, f, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float32, u16, f32, u, f, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint64, float32, u64, f32, u, f, 64, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float32, s8, f32, i, f, 8, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float32, s16, f32, i, f, 16, 32) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int64, float32, s64, f32, i, f, 64, 32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint8, float64, u8, f64, u, f, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint16, float64, u16, f64, u, f, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(uint32, float64, u32, f64, u, f, 32, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int8, float64, s8, f64, i, f, 8, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int16, float64, s16, f64, i, f, 16, 64) +OPENCV_HAL_IMPL_RVV_TWO_TIMES_REINTERPRET(int32, float64, s32, f64, i, f, 32, 64) +// Three times reinterpret +inline v_float32 v_reinterpret_as_f32(const v_float64& v) \ +{ \ + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vreinterpret_v_u64m2_u32m2(__riscv_vreinterpret_v_f64m2_u64m2(v)));\ +} + +inline v_float64 v_reinterpret_as_f64(const v_float32& v) \ +{ \ + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vreinterpret_v_u32m2_u64m2(__riscv_vreinterpret_v_f32m2_u32m2(v)));\ +} +#endif + +//////////// Extract ////////////// + +#define OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(_Tpvec, _Tp, vl) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \ +} \ +template inline _Tp v_extract_n(_Tpvec v, int i = s) \ +{ \ + return __riscv_vmv_x(__riscv_vslidedown(v, i, vl)); \ +} + +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint8, uchar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int8, schar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint16, ushort, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int16, short, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint32, unsigned int, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int32, int, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_uint64, uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT_INTEGER(v_int64, int64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_EXTRACT_FP(_Tpvec, _Tp, vl) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b, int i = s) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(a, i, vl), b, VTraits<_Tpvec>::vlanes() - i, vl); \ +} \ +template inline _Tp v_extract_n(_Tpvec v, int i = s) \ +{ \ + return __riscv_vfmv_f(__riscv_vslidedown(v, i, vl)); \ +} + +OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float32, float, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_EXTRACT_FP(v_float64, double, VTraits::vlanes()) +#endif + +#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, vl) \ +inline _Tp v_extract_highest(_Tpvec v) \ +{ \ + return v_extract_n(v, vl-1); \ +} + +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8, uchar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8, schar, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16, ushort, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16, short, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32, unsigned int, VTraits::vlanes()) 
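+// NOTE (illustrative sketch, not part of the upstream header): v_extract_n<i> reads a
+// single lane back into a scalar, and v_extract_highest returns the last lane
+// (lane VTraits<T>::vlanes()-1). Hypothetical, disabled usage:
+#if 0
+inline int example_extract_usage()
+{
+    v_int32 v = v_setall_s32(7);
+    return v_extract_n<0>(v); // first lane, i.e. 7
+}
+#endif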
+OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32, int, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64, uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64, int64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32, float, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64, double, VTraits::vlanes()) +#endif + + +////////////// Load/Store ////////////// +#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, vl, width, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m2(ptr, vl); \ +} \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m2(ptr, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ \ + __riscv_vse##width##_v_##suffix##m2(ptr, a, vl); \ +} \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ \ + return __riscv_vle##width##_v_##suffix##m2(ptr, hvl); \ +} \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ \ + return __riscv_vslideup(__riscv_vle##width##_v_##suffix##m2(ptr0, hvl), __riscv_vle##width##_v_##suffix##m2(ptr1, hvl), hvl, vl); \ +} \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width(ptr, a, vl); \ +} \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width(ptr, a, vl); \ +} \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width(ptr, a, vl); \ +} \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width(ptr, a, hvl); \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + __riscv_vse##width(ptr, __riscv_vslidedown_vx_##suffix##m2(a, hvl, vl), hvl); \ +} \ +template \ +_Tpvec v_load_##suffix(Targs... 
nScalars) \ +{ \ + return v_load({nScalars...}); \ +} + + +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8, vuint8m2_t, uchar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, u8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8, vint8m2_t, schar, VTraits::vlanes() / 2, VTraits::vlanes(), 8, i8) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16, vuint16m2_t, ushort, VTraits::vlanes() / 2, VTraits::vlanes(), 16, u16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16, vint16m2_t, short, VTraits::vlanes() / 2, VTraits::vlanes(), 16, i16) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32, vuint32m2_t, unsigned int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, u32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32, vint32m2_t, int, VTraits::vlanes() / 2, VTraits::vlanes(), 32, i32) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64, vuint64m2_t, uint64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, u64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64, vint64m2_t, int64, VTraits::vlanes() / 2, VTraits::vlanes(), 64, i64) +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32, vfloat32m2_t, float, VTraits::vlanes() /2 , VTraits::vlanes(), 32, f32) + +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64, vfloat64m2_t, double, VTraits::vlanes() / 2, VTraits::vlanes(), 64, f64) +#endif + +////////////// Lookup table access //////////////////// +#define OPENCV_HAL_IMPL_RVV_LUT(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_lut(const _Tp* tab, const int* idx) \ +{ \ + auto vidx = __riscv_vmul(__riscv_vreinterpret_u32##suffix(__riscv_vle32_v_i32##suffix(idx, VTraits<_Tpvec>::vlanes())), sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT(v_int8, schar, m8) +OPENCV_HAL_IMPL_RVV_LUT(v_int16, short, m4) +OPENCV_HAL_IMPL_RVV_LUT(v_int32, int, m2) +OPENCV_HAL_IMPL_RVV_LUT(v_int64, int64_t, m1) +OPENCV_HAL_IMPL_RVV_LUT(v_float32, float, m2) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LUT(v_float64, double, m1) +#endif + +#define OPENCV_HAL_IMPL_RVV_LUT_PAIRS(_Tpvec, _Tp, suffix1, suffix2, v_trunc) \ +inline _Tpvec v_lut_pairs(const _Tp* tab, const int* idx) \ +{ \ + auto v0 = __riscv_vle32_v_u32##suffix1((unsigned*)idx, VTraits<_Tpvec>::vlanes()/2); \ + auto v1 = __riscv_vadd(v0, 1, VTraits<_Tpvec>::vlanes()/2); \ + auto w0 = __riscv_vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/2); \ + auto w1 = __riscv_vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/2); \ + auto sh1 = __riscv_vslide1up(v_trunc(__riscv_vreinterpret_u32##suffix2(w1)),0, VTraits<_Tpvec>::vlanes()); \ + auto vid = __riscv_vor(sh1, v_trunc(__riscv_vreinterpret_u32##suffix2(w0)), VTraits<_Tpvec>::vlanes()); \ + auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int8, schar, m4, m8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int16, short, m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int32, int, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float32, float, m1, m2, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_int64, int64_t, m1, m2, __riscv_vlmul_trunc_u32m1) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_LUT_PAIRS(v_float64, double, m1, m2, __riscv_vlmul_trunc_u32m1) +#endif + + +#define OPENCV_HAL_IMPL_RVV_LUT_QUADS(_Tpvec, _Tp, suffix0, suffix1, suffix2, v_trunc) \ +inline _Tpvec v_lut_quads(const _Tp* tab, const int* idx) \ +{ \ + auto v0 = __riscv_vle32_v_u32##suffix0((unsigned*)idx, VTraits<_Tpvec>::vlanes()/4); \ + auto v1 = __riscv_vadd(v0, 1, VTraits<_Tpvec>::vlanes()/4); \ + 
auto v2 = __riscv_vadd(v0, 2, VTraits<_Tpvec>::vlanes()/4); \ + auto v3 = __riscv_vadd(v0, 3, VTraits<_Tpvec>::vlanes()/4); \ + auto w0 = __riscv_vwcvtu_x(v0, VTraits<_Tpvec>::vlanes()/4); \ + auto w1 = __riscv_vwcvtu_x(v1, VTraits<_Tpvec>::vlanes()/4); \ + auto w2 = __riscv_vwcvtu_x(v2, VTraits<_Tpvec>::vlanes()/4); \ + auto w3 = __riscv_vwcvtu_x(v3, VTraits<_Tpvec>::vlanes()/4); \ + auto sh2 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix1(w2),0, VTraits<_Tpvec>::vlanes()/2); \ + auto sh3 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix1(w3),0, VTraits<_Tpvec>::vlanes()/2); \ + auto vid0 = __riscv_vor(sh2, __riscv_vreinterpret_u32##suffix1(w0), VTraits<_Tpvec>::vlanes()/2); \ + auto vid1 = __riscv_vor(sh3, __riscv_vreinterpret_u32##suffix1(w1), VTraits<_Tpvec>::vlanes()/2); \ + auto wid0 = __riscv_vwcvtu_x(v_trunc(vid0), VTraits<_Tpvec>::vlanes()/2); \ + auto wid1 = __riscv_vwcvtu_x(v_trunc(vid1), VTraits<_Tpvec>::vlanes()/2); \ + auto shwid1 = __riscv_vslide1up(__riscv_vreinterpret_u32##suffix2(wid1),0, VTraits<_Tpvec>::vlanes()); \ + auto vid = __riscv_vor(shwid1, __riscv_vreinterpret_u32##suffix2(wid0), VTraits<_Tpvec>::vlanes()); \ + auto vidx = __riscv_vmul(vid, sizeof(_Tp), VTraits<_Tpvec>::vlanes()); \ + return __riscv_vloxei32(tab, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int8, schar, m2, m4, m8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int16, short, m1 , m2, m4, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_int32, int, m1, m2, m2, __riscv_vlmul_trunc_u32m1) +OPENCV_HAL_IMPL_RVV_LUT_QUADS(v_float32, float, m1, m2, m2, __riscv_vlmul_trunc_u32m1) + +#define OPENCV_HAL_IMPL_RVV_LUT_VEC(_Tpvec, _Tp) \ +inline _Tpvec v_lut(const _Tp* tab, const v_int32& vidx) \ +{ \ + v_uint32 vidx_ = __riscv_vmul(__riscv_vreinterpret_u32m2(vidx), sizeof(_Tp), VTraits::vlanes()); \ + return __riscv_vloxei32(tab, vidx_, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_float32, float) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_int32, int) +OPENCV_HAL_IMPL_RVV_LUT_VEC(v_uint32, unsigned) + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_lut(const double* tab, const v_int32& vidx) \ +{ \ + vuint32m1_t vidx_ = __riscv_vmul(__riscv_vlmul_trunc_u32m1(__riscv_vreinterpret_u32m2(vidx)), sizeof(double), VTraits::vlanes()); \ + return __riscv_vloxei32(tab, vidx_, VTraits::vlanes()); \ +} +#endif + + +inline v_uint8 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } +inline v_uint16 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } +inline v_uint32 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } +inline v_uint64 v_lut(const uint64* tab, const int* idx) { return 
v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +////////////// Pack boolean //////////////////// +inline v_uint8 v_pack_b(const v_uint16& a, const v_uint16& b) +{ + return __riscv_vnsrl(__riscv_vset(__riscv_vlmul_ext_v_u16m2_u16m4(a),1,b), 0, VTraits::vlanes()); +} + +inline v_uint8 v_pack_b(const v_uint32& a, const v_uint32& b, + const v_uint32& c, const v_uint32& d) +{ + + return __riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u32m8(a),1,b),2,c),3,d), 0, VTraits::vlanes()), 0, VTraits::vlanes()); +} + +inline v_uint8 v_pack_b(const v_uint64& a, const v_uint64& b, const v_uint64& c, + const v_uint64& d, const v_uint64& e, const v_uint64& f, + const v_uint64& g, const v_uint64& h) +{ + vuint8m1_t t0 = __riscv_vnsrl(__riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u64m8(a),1,b),2,c),3,d), 0, VTraits::vlanes()), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + vuint8m1_t t1 = __riscv_vnsrl(__riscv_vnsrl(__riscv_vnsrl(__riscv_vset(__riscv_vset(__riscv_vset(__riscv_vlmul_ext_u64m8(e),1,f),2,g),3,h), 0, VTraits::vlanes()), 0, VTraits::vlanes()), 0, VTraits::vlanes()); + + return __riscv_vset(__riscv_vlmul_ext_u8m2(t0), 1, t1); +} + +////////////// Arithmetics ////////////// +#define OPENCV_HAL_IMPL_RVV_BIN_OP(_Tpvec, ocv_intrin, rvv_intrin) \ +inline _Tpvec v_##ocv_intrin(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return rvv_intrin(a, b, VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add, __riscv_vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub, __riscv_vssubu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add, __riscv_vsadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub, __riscv_vssub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add, __riscv_vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub, __riscv_vssubu) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add, __riscv_vsadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub, __riscv_vssub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, add, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, sub, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint32, mul, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, add, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, sub, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int32, mul, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, add, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, sub, __riscv_vfsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, mul, __riscv_vfmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float32, div, __riscv_vfdiv) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, add, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint64, sub, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, add, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int64, sub, __riscv_vsub) + +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, add, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, sub, __riscv_vfsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, mul, __riscv_vfmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_float64, div, __riscv_vfdiv) +#endif + +#define OPENCV_HAL_IMPL_RVV_BIN_MADD(_Tpvec, rvv_add) \ +template \ +inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const Args&... vf) { \ + return v_add(rvv_add(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ +} +#define OPENCV_HAL_IMPL_RVV_BIN_MMUL(_Tpvec, rvv_mul) \ +template \ +inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const Args&... 
vf) { \ + return v_mul(rvv_mul(f1, f2, VTraits<_Tpvec>::vlanes()), vf...); \ +} +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint8, __riscv_vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int8, __riscv_vsadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint16, __riscv_vsaddu) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int16, __riscv_vsadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint32, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int32, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float32, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_uint64, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_int64, __riscv_vadd) + +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_uint32, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_int32, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float32, __riscv_vfmul) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_MADD(v_float64, __riscv_vfadd) +OPENCV_HAL_IMPL_RVV_BIN_MMUL(v_float64, __riscv_vfmul) +#endif + +#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _TpwvecM2, suffix, wmul) \ +inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \ +{ \ + _TpwvecM2 temp = wmul(a, b, VTraits<_Tpvec>::vlanes()); \ + c = __riscv_vget_##suffix##m2(temp, 0); \ + d = __riscv_vget_##suffix##m2(temp, 1); \ +} + +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8, v_uint16, vuint16m4_t, u16, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8, v_int16, vint16m4_t, i16, __riscv_vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16, v_uint32, vuint32m4_t, u32, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16, v_int32, vint32m4_t, i32, __riscv_vwmul) +OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32, v_uint64, vuint64m4_t, u64, __riscv_vwmulu) + +inline v_int16 v_mul_hi(const v_int16& a, const v_int16& b) +{ + return __riscv_vmulh(a, b, VTraits::vlanes()); +} +inline v_uint16 v_mul_hi(const v_uint16& a, const v_uint16& b) +{ + return __riscv_vmulhu(a, b, VTraits::vlanes()); +} + +////////////// Arithmetics (wrap)////////////// +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, add_wrap, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, add_wrap, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, add_wrap, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, add_wrap, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, sub_wrap, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, sub_wrap, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, sub_wrap, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, sub_wrap, __riscv_vsub) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint8, mul_wrap, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int8, mul_wrap, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_uint16, mul_wrap, __riscv_vmul) +OPENCV_HAL_IMPL_RVV_BIN_OP(v_int16, mul_wrap, __riscv_vmul) + +//////// Saturating Multiply //////// +#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _clip, _wmul) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _clip(_wmul(a, b, VTraits<_Tpvec>::vlanes()), 0, 0, VTraits<_Tpvec>::vlanes()); \ +} \ +template \ +inline _Tpvec v_mul(const _Tpvec& a1, const _Tpvec& a2, const Args&... 
va) { \ + return v_mul(_clip(_wmul(a1, a2, VTraits<_Tpvec>::vlanes()), 0, 0, VTraits<_Tpvec>::vlanes()), va...); \ +} + +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8, __riscv_vnclipu, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8, __riscv_vnclip, __riscv_vwmul) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16, __riscv_vnclipu, __riscv_vwmulu) +OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16, __riscv_vnclip, __riscv_vwmul) + +////////////// Bitwise logic ////////////// + +#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, vl) \ +inline _Tpvec v_and(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vand(a, b, vl); \ +} \ +inline _Tpvec v_or(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vor(a, b, vl); \ +} \ +inline _Tpvec v_xor(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vxor(a, b, vl); \ +} \ +inline _Tpvec v_not (const _Tpvec& a) \ +{ \ + return __riscv_vnot(a, vl); \ +} + +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(intrin) \ +inline v_float32 intrin (const v_float32& a, const v_float32& b) \ +{ \ + return __riscv_vreinterpret_f32m2(intrin(__riscv_vreinterpret_i32m2(a), __riscv_vreinterpret_i32m2(b))); \ +} +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_and) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_or) +OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(v_xor) + +inline v_float32 v_not (const v_float32& a) \ +{ \ + return __riscv_vreinterpret_f32m2(v_not(__riscv_vreinterpret_i32m2(a))); \ +} + +#if CV_SIMD_SCALABLE_64F +#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(intrin) \ +inline v_float64 intrin (const v_float64& a, const v_float64& b) \ +{ \ + return __riscv_vreinterpret_f64m2(intrin(__riscv_vreinterpret_i64m2(a), __riscv_vreinterpret_i64m2(b))); \ +} +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_and) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_or) +OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(v_xor) + +inline v_float64 v_not (const v_float64& a) \ +{ \ + return __riscv_vreinterpret_f64m2(v_not(__riscv_vreinterpret_i64m2(a))); \ +} +#endif + + +////////////// Bitwise shifts ////////////// +/* Usage +1. v_shl(vec); +2. v_shl(vec, N); // instead of vec << N, when N is non-constant. 
+*/ + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, vl) \ +template inline _Tpvec v_shl(const _Tpvec& a, int n = s) \ +{ \ + return _Tpvec(__riscv_vsll(a, uint8_t(n), vl)); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a, int n = s) \ +{ \ + return _Tpvec(__riscv_vsrl(a, uint8_t(n), vl)); \ +} + +#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, vl) \ +template inline _Tpvec v_shl(const _Tpvec& a, int n = s) \ +{ \ + return _Tpvec(__riscv_vsll(a, uint8_t(n), vl)); \ +} \ +template inline _Tpvec v_shr(const _Tpvec& a, int n = s) \ +{ \ + return _Tpvec(__riscv_vsra(a, uint8_t(n), vl)); \ +} + +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64, VTraits::vlanes()) + +////////////// Comparison ////////////// +#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix) \ +inline _Tpvec v_##op(const _Tpvec& a, const _Tpvec& b) \ +{ \ + size_t VLEN = VTraits<_Tpvec>::vlanes(); \ + uint64_t ones = -1; \ + return __riscv_vmerge(__riscv_vmv_v_x_##suffix##m2(0, VLEN), ones, intrin(a, b, VLEN), VLEN); \ +} + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix) \ +inline _Tpvec v_##op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + size_t VLEN = VTraits<_Tpvec>::vlanes(); \ + union { uint64_t u; VTraits<_Tpvec>::lane_type d; } ones; \ + ones.u = -1; \ + auto diff = intrin(a, b, VLEN); \ + auto z = __riscv_vfmv_v_f_##suffix##m2(0, VLEN); \ + auto res = __riscv_vfmerge(z, ones.d, diff, VLEN); \ + return _Tpvec(res); \ +} //TODO + +#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, __riscv_vmseq, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, __riscv_vmsne, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, __riscv_vmsltu, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, __riscv_vmsgtu, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, __riscv_vmsleu, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, __riscv_vmsgeu, suffix) + +#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, eq, __riscv_vmseq, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ne, __riscv_vmsne, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, lt, __riscv_vmslt, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, gt, __riscv_vmsgt, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, le, __riscv_vmsle, suffix) \ +OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ge, __riscv_vmsge, suffix) + +#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, eq, __riscv_vmfeq, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ne, __riscv_vmfne, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, lt, __riscv_vmflt, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, gt, __riscv_vmfgt, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, le, __riscv_vmfle, suffix) \ +OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ge, __riscv_vmfge, suffix) + + +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8, u8) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16, u16) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32, u32) +OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64, u64) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8, i8) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16, i16) 
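+// NOTE (illustrative sketch, not part of the upstream header): as the "Usage" comment
+// above explains, the shift helpers take the shift amount either as a template
+// parameter or as a run-time argument; the comparison helpers defined here return a
+// vector whose lanes are all-ones where the predicate holds. Hypothetical, disabled usage:
+#if 0
+inline v_uint32 example_shift_and_compare(const v_uint32& a, const v_uint32& b)
+{
+    v_uint32 twice = v_shl<1>(a);   // compile-time shift amount
+    v_uint32 some  = v_shr(a, 3);   // run-time shift amount
+    v_uint32 mask  = v_lt(some, b); // all-ones lanes where some < b
+    return v_and(twice, mask);
+}
+#endif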
+OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32, i32) +OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64, i64) +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32, f32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64, f64) +#endif + +inline v_float32 v_not_nan(const v_float32& a) +{ return v_eq(a, a); } + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_not_nan(const v_float64& a) +{ return v_eq(a, a); } +#endif + +////////////// Min/Max ////////////// + +#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, vl) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return intrin(a, b, vl); \ +} + +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_min, __riscv_vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8, v_max, __riscv_vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_min, __riscv_vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8, v_max, __riscv_vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_min, __riscv_vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16, v_max, __riscv_vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_min, __riscv_vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16, v_max, __riscv_vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_min, __riscv_vminu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32, v_max, __riscv_vmaxu, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_min, __riscv_vmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32, v_max, __riscv_vmax, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_min, __riscv_vfmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32, v_max, __riscv_vfmax, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_min, __riscv_vfmin, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64, v_max, __riscv_vfmax, VTraits::vlanes()) +#endif + +////////////// Transpose4x4 ////////////// +#define OPENCV_HAL_IMPL_RVV_ZIP4(_Tpvec, _wTpvec, suffix, convert2u, convert) \ +inline void v_zip4(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \ + int vl = 4; \ + _wTpvec temp = __riscv_vreinterpret_##suffix##m4(convert2u( \ + __riscv_vor(__riscv_vzext_vf2(convert(a0), vl), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(convert(a1), vl)), 0, vl*2)), \ + vl))); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(__riscv_vrgather(temp, __riscv_vadd(__riscv_vid_v_u32m4(vl), 4, vl)/*{4,5,6,7} */, vl) ,0); \ +} + +OPENCV_HAL_IMPL_RVV_ZIP4(v_uint32, vuint32m4_t, u32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP4(v_int32, vint32m4_t, i32, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) +OPENCV_HAL_IMPL_RVV_ZIP4(v_float32, vfloat32m4_t, f32, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) + + +#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, suffix) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, const _Tpvec& a2, const _Tpvec& a3, _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) { \ + _Tpvec t0,t1,t2,t3; \ + v_zip4(a0, a2, t0, t2); \ + v_zip4(a1, a3, t1, t3); \ + v_zip4(t0, t1, b0, b1); \ + v_zip4(t2, t3, b2, b3); \ +} + +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_uint32, u32) +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_int32, i32) +OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(v_float32, f32) + +////////////// Reduce ////////////// + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl, red) \ +inline scalartype 
v_reduce_sum(const _Tpvec& a) \ +{ \ + _nwTpvec zero = __riscv_vmv_v_x_##wsuffix##m1(0, vl); \ + _nwTpvec res = __riscv_vmv_v_x_##wsuffix##m1(0, vl); \ + res = __riscv_v##red(a, zero, vl); \ + return (scalartype)__riscv_vmv_x(res); \ +} +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8, v_uint16, vuint16m1_t, unsigned, u16, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8, v_int16, vint16m1_t, int, i16, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16, v_uint32, vuint32m1_t, unsigned, u32, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16, v_int32, vint32m1_t, int, i32, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32, v_uint64, vuint64m1_t, unsigned, u64, VTraits::vlanes(), wredsumu) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32, v_int64, vint64m1_t, int, i64, VTraits::vlanes(), wredsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64, v_uint64, vuint64m1_t, uint64, u64, VTraits::vlanes(), redsum) +OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64, v_int64, vint64m1_t, int64, i64, VTraits::vlanes(), redsum) + + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(_Tpvec, _wTpvec, _nwTpvec, scalartype, wsuffix, vl) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + _nwTpvec zero = __riscv_vfmv_v_f_##wsuffix##m1(0, vl); \ + _nwTpvec res = __riscv_vfmv_v_f_##wsuffix##m1(0, vl); \ + res = __riscv_vfredusum(a, zero, vl); \ + return (scalartype)__riscv_vfmv_f(res); \ +} +OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float32, v_float32, vfloat32m1_t, float, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_REDUCE_SUM_FP(v_float64, v_float64, vfloat64m1_t, float, f64, VTraits::vlanes()) +#endif + +#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, _nTpvec, func, scalartype, suffix, vl, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _nTpvec narrowM1 = __riscv_vlmul_trunc_##suffix##m1(a); \ + return (scalartype)__riscv_vmv_x(__riscv_v##red(a, narrowM1, vl)); \ +} + +#define OPENCV_HAL_IMPL_RVV_REDUCE_FP(_Tpvec, _nTpvec, func, scalartype, suffix, vl, red) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _nTpvec narrowM1 = __riscv_vlmul_trunc_##suffix##m1(a); \ + return (scalartype)__riscv_vfmv_f(__riscv_v##red(a, narrowM1, vl)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, vuint8m1_t, min, uchar, u8, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, vint8m1_t, min, schar, i8, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, vuint16m1_t, min, ushort, u16, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, vint16m1_t, min, short, i16, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, vuint32m1_t, min, unsigned, u32, VTraits::vlanes(), redminu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, vint32m1_t, min, int, i32, VTraits::vlanes(), redmin) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float32, vfloat32m1_t, min, float, f32, VTraits::vlanes(), fredmin) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8, vuint8m1_t, max, uchar, u8, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int8, vint8m1_t, max, schar, i8, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16, vuint16m1_t, max, ushort, u16, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int16, vint16m1_t, max, short, i16, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32, vuint32m1_t, max, unsigned, u32, VTraits::vlanes(), redmaxu) +OPENCV_HAL_IMPL_RVV_REDUCE(v_int32, vint32m1_t, max, int, i32, VTraits::vlanes(), redmax) +OPENCV_HAL_IMPL_RVV_REDUCE_FP(v_float32, vfloat32m1_t, max, float, f32, 
VTraits::vlanes(), fredmax) + +inline v_float32 v_reduce_sum4(const v_float32& a, const v_float32& b, + const v_float32& c, const v_float32& d) +{ + // 0000 1111 2222 3333 .... + vuint64m4_t vid1 = __riscv_vid_v_u64m4(VTraits::vlanes()); + vuint16m4_t t1 = __riscv_vreinterpret_u16m4(vid1); + vuint16m4_t t2 = __riscv_vslide1up(t1, 0, VTraits::vlanes()); + vuint16m4_t t3 = __riscv_vslide1up(t2, 0, VTraits::vlanes()); + vuint16m4_t t4 = __riscv_vslide1up(t3, 0, VTraits::vlanes()); + t1 = __riscv_vor( + __riscv_vor(t1, t2, VTraits::vlanes()), + __riscv_vor(t3, t4, VTraits::vlanes()), + VTraits::vlanes() + ); + + // index for transpose4X4 + vuint16m4_t vidx0 = __riscv_vmul(t1, 12, VTraits::vlanes()); + vidx0 = __riscv_vadd(vidx0, __riscv_vid_v_u16m4(VTraits::vlanes()), VTraits::vlanes()); + vuint16m4_t vidx1 = __riscv_vadd(vidx0, 4, VTraits::vlanes()); + vuint16m4_t vidx2 = __riscv_vadd(vidx0, 8, VTraits::vlanes()); + vuint16m4_t vidx3 = __riscv_vadd(vidx0, 12, VTraits::vlanes()); + + // zip + vuint32m4_t tempA = __riscv_vreinterpret_u32m4( \ + __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(a), VTraits::vlanes()), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(c), VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes())); \ + vuint32m4_t tempB = __riscv_vreinterpret_u32m4( \ + __riscv_vor(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(b), VTraits::vlanes()), \ + __riscv_vreinterpret_u64m4(__riscv_vslide1up(__riscv_vreinterpret_u32m4(__riscv_vzext_vf2(__riscv_vreinterpret_u32m2(d), VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes())); \ + vfloat32m8_t temp = __riscv_vreinterpret_f32m8(__riscv_vreinterpret_u32m8( \ + __riscv_vor(__riscv_vzext_vf2(tempA, VTraits::vlanes()), \ + __riscv_vreinterpret_u64m8(__riscv_vslide1up(__riscv_vreinterpret_u32m8(__riscv_vzext_vf2(tempB, VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes()))); + + // transpose + vfloat32m2_t b0 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx0, VTraits::vlanes())); + vfloat32m2_t b1 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx1, VTraits::vlanes())); + vfloat32m2_t b2 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx2, VTraits::vlanes())); + vfloat32m2_t b3 = __riscv_vlmul_trunc_f32m2(__riscv_vrgatherei16(temp, vidx3, VTraits::vlanes())); + + // vector add + v_float32 res = __riscv_vfadd( + __riscv_vfadd(b0, b1, VTraits::vlanes()), + __riscv_vfadd(b2, b3, VTraits::vlanes()), + VTraits::vlanes() + ); + return res; +} + +////////////// Square-Root ////////////// + +inline v_float32 v_sqrt(const v_float32& x) +{ + return __riscv_vfsqrt(x, VTraits::vlanes()); +} + +inline v_float32 v_invsqrt(const v_float32& x) +{ + v_float32 one = v_setall_f32(1.0f); + return v_div(one, v_sqrt(x)); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_sqrt(const v_float64& x) +{ + return __riscv_vfsqrt(x, VTraits::vlanes()); +} + +inline v_float64 v_invsqrt(const v_float64& x) +{ + v_float64 one = v_setall_f64(1.0f); + return v_div(one, v_sqrt(x)); +} +#endif + +inline v_float32 v_magnitude(const v_float32& a, const v_float32& b) +{ + v_float32 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); + return v_sqrt(x); +} + +inline v_float32 v_sqr_magnitude(const v_float32& a, const v_float32& b) +{ + return v_float32(__riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes())); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_magnitude(const 
v_float64& a, const v_float64& b) +{ + v_float64 x = __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); + return v_sqrt(x); +} + +inline v_float64 v_sqr_magnitude(const v_float64& a, const v_float64& b) +{ + return __riscv_vfmacc(__riscv_vfmul(a, a, VTraits::vlanes()), b, b, VTraits::vlanes()); +} +#endif + +////////////// Multiply-Add ////////////// + +inline v_float32 v_fma(const v_float32& a, const v_float32& b, const v_float32& c) +{ + return __riscv_vfmacc(c, a, b, VTraits::vlanes()); +} +inline v_int32 v_fma(const v_int32& a, const v_int32& b, const v_int32& c) +{ + return __riscv_vmacc(c, a, b, VTraits::vlanes()); +} + +inline v_float32 v_muladd(const v_float32& a, const v_float32& b, const v_float32& c) +{ + return v_fma(a, b, c); +} + +inline v_int32 v_muladd(const v_int32& a, const v_int32& b, const v_int32& c) +{ + return v_fma(a, b, c); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_fma(const v_float64& a, const v_float64& b, const v_float64& c) +{ + return __riscv_vfmacc_vv_f64m2(c, a, b, VTraits::vlanes()); +} + +inline v_float64 v_muladd(const v_float64& a, const v_float64& b, const v_float64& c) +{ + return v_fma(a, b, c); +} +#endif + +////////////// Check all/any ////////////// + +#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, vl) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + return (int)__riscv_vcpop(__riscv_vmslt(a, 0, vl), vl) == vl; \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + return (int)__riscv_vcpop(__riscv_vmslt(a, 0, vl), vl) != 0; \ +} + +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_int64, VTraits::vlanes()) + + +inline bool v_check_all(const v_uint8& a) +{ return v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint8& a) +{ return v_check_any(v_reinterpret_as_s8(a)); } + +inline bool v_check_all(const v_uint16& a) +{ return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint16& a) +{ return v_check_any(v_reinterpret_as_s16(a)); } + +inline bool v_check_all(const v_uint32& a) +{ return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint32& a) +{ return v_check_any(v_reinterpret_as_s32(a)); } + +inline bool v_check_all(const v_float32& a) +{ return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32& a) +{ return v_check_any(v_reinterpret_as_s32(a)); } + +inline bool v_check_all(const v_uint64& a) +{ return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint64& a) +{ return v_check_any(v_reinterpret_as_s64(a)); } + +#if CV_SIMD_SCALABLE_64F +inline bool v_check_all(const v_float64& a) +{ return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64& a) +{ return v_check_any(v_reinterpret_as_s64(a)); } +#endif + +////////////// abs ////////////// + +#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \ +inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_sub(v_max(a, b), v_min(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32, absdiff) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32, absdiff) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64, absdiff) +#endif +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8, absdiffs) +OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16, absdiffs) + +#define 
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, width) \ +inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vnclipu(__riscv_vreinterpret_u##width##m4(__riscv_vwsub_vv(v_max(a, b), v_min(a, b), VTraits<_Tpvec>::vlanes())), 0, 0, VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8, v_uint8, 16) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16, v_uint16, 32) +OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32, v_uint32, 64) + +#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \ +inline _Tprvec v_abs(const _Tpvec& a) \ +{ \ + return v_absdiff(a, v_setzero_##suffix()); \ +} + +OPENCV_HAL_IMPL_RVV_ABS(v_uint8, v_int8, s8) +OPENCV_HAL_IMPL_RVV_ABS(v_uint16, v_int16, s16) +OPENCV_HAL_IMPL_RVV_ABS(v_uint32, v_int32, s32) +OPENCV_HAL_IMPL_RVV_ABS(v_float32, v_float32, f32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ABS(v_float64, v_float64, f64) +#endif + + +#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \ +inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return v_reduce_sum(v_absdiff(a, b)); \ +} + +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32, unsigned) +OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32, float) + +////////////// Select ////////////// + +#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, vl) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vmerge(b, a, __riscv_vmsne(mask, 0, vl), vl); \ +} + +OPENCV_HAL_IMPL_RVV_SELECT(v_uint8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_uint32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_SELECT(v_int32, VTraits::vlanes()) + +inline v_float32 v_select(const v_float32& mask, const v_float32& a, const v_float32& b) \ +{ \ + return __riscv_vmerge(b, a, __riscv_vmfne(mask, 0, VTraits::vlanes()), VTraits::vlanes()); \ +} + +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_select(const v_float64& mask, const v_float64& a, const v_float64& b) \ +{ \ + return __riscv_vmerge(b, a, __riscv_vmfne(mask, 0, VTraits::vlanes()), VTraits::vlanes()); \ +} +#endif + +////////////// Rotate shift ////////////// + +#define OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(_Tpvec, suffix, vl) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + return __riscv_vslidedown(a, n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return __riscv_vslideup(__riscv_vmv_v_x_##suffix##m2(0, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint8, u8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int8, i8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint16, 
u16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int16, i16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint32, u32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int32, i32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_uint64, u64, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_ROTATE_INTEGER(v_int64, i64, VTraits::vlanes()) + +#define OPENCV_HAL_IMPL_RVV_ROTATE_FP(_Tpvec, suffix, vl) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + return __riscv_vslidedown(a, n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return __riscv_vslideup(__riscv_vfmv_v_f_##suffix##m2(0, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ return a; } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(a, n, vl), b, VTraits<_Tpvec>::vlanes() - n, vl); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup(__riscv_vslidedown(b, VTraits<_Tpvec>::vlanes() - n, vl), a, n, vl); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ CV_UNUSED(b); return a; } + +OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float32, f32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_ROTATE_FP(v_float64, f64, VTraits::vlanes()) +#endif + +////////////// Convert to float ////////////// +inline v_float32 v_cvt_f32(const v_int32& a) +{ + return __riscv_vfcvt_f_x_v_f32m2(a, VTraits::vlanes()); +} + +#if CV_SIMD_SCALABLE_64F +inline v_float32 v_cvt_f32(const v_float64& a) +{ + return __riscv_vfncvt_f(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); +} + +inline v_float32 v_cvt_f32(const v_float64& a, const v_float64& b) +{ + return __riscv_vfncvt_f(__riscv_vset(__riscv_vlmul_ext_f64m4(a),1,b), VTraits::vlanes()); +} + +inline v_float64 v_cvt_f64(const v_int32& a) +{ + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); +} + +inline v_float64 v_cvt_f64_high(const v_int32& a) +{ + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); +} + +inline v_float64 v_cvt_f64(const v_float32& a) +{ + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 0); +} + +inline v_float64 v_cvt_f64_high(const v_float32& a) +{ + return __riscv_vget_f64m2(__riscv_vfwcvt_f(a, VTraits::vlanes()), 1); +} + +inline v_float64 v_cvt_f64(const v_int64& a) +{ + return __riscv_vfcvt_f(a, VTraits::vlanes()); +} +#endif + +//////////// Broadcast ////////////// + +#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \ +template inline _Tpvec v_broadcast_element(_Tpvec v, int i = s) \ +{ \ + return v_setall_##suffix(v_extract_n(v, i)); \ +} \ +inline _Tpvec v_broadcast_highest(_Tpvec v) \ +{ \ + return v_setall_##suffix(v_extract_n(v, VTraits<_Tpvec>::vlanes()-1)); \ +} + +OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32, u32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32, s32) +OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32, f32) + + +////////////// Reverse ////////////// +#define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, width) \ +inline _Tpvec v_reverse(const _Tpvec& a) \ +{ \ + vuint##width##m2_t vidx = __riscv_vrsub(__riscv_vid_v_u##width##m2(VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()-1, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vrgather(a, vidx, VTraits<_Tpvec>::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8, 8) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int8, 8) +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16, 16) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int16, 16) 
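+// NOTE (illustrative sketch, not part of the upstream header): v_rotate_right<n> moves
+// lane contents towards lower indices (the two-argument form fills the tail from a
+// second vector), and v_reverse flips the lane order. Hypothetical, disabled usage:
+#if 0
+inline v_int16 example_rotate_reverse(const v_int16& a, const v_int16& b)
+{
+    v_int16 shifted = v_rotate_right<2>(a, b); // lanes 2.. of a, tail filled from b
+    return v_reverse(shifted);                 // lane order reversed
+}
+#endif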
+OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32, 32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int32, 32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_float32, 32) +OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64, 64) +OPENCV_HAL_IMPL_RVV_REVERSE(v_int64, 64) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_REVERSE(v_float64, 64) +#endif + +//////////// Value reordering //////////// + +#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tp, _Tpwvec, _Tpwvec_m2, _Tpvec, width, suffix, suffix2, cvt) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(temp, 1); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vget_##suffix##m2(temp, 0); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tpwvec_m2 temp = cvt(a, VTraits<_Tpvec>::vlanes()); \ + return __riscv_vget_##suffix##m2(temp, 1); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return cvt(__riscv_vle##width##_v_##suffix2##m1(ptr, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_EXPAND(uchar, v_uint16, vuint16m4_t, v_uint8, 8, u16, u8, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(schar, v_int16, vint16m4_t, v_int8, 8, i16, i8, __riscv_vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(ushort, v_uint32, vuint32m4_t, v_uint16, 16, u32, u16, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(short, v_int32, vint32m4_t, v_int16, 16, i32, i16, __riscv_vwcvt_x) +OPENCV_HAL_IMPL_RVV_EXPAND(uint, v_uint64, vuint64m4_t, v_uint32, 32, u64, u32, __riscv_vwcvtu_x) +OPENCV_HAL_IMPL_RVV_EXPAND(int, v_int64, vint64m4_t, v_int32, 32, i64, i32, __riscv_vwcvt_x) + +inline v_uint32 v_load_expand_q(const uchar* ptr) +{ + return __riscv_vwcvtu_x(__riscv_vwcvtu_x(__riscv_vle8_v_u8mf2(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); +} + +inline v_int32 v_load_expand_q(const schar* ptr) +{ + return __riscv_vwcvt_x(__riscv_vwcvt_x(__riscv_vle8_v_i8mf2(ptr, VTraits::vlanes()), VTraits::vlanes()), VTraits::vlanes()); +} + +#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \ +inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ +{ \ + return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, 0, VTraits<_Tpvec>::vlanes()); \ +} \ +inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, shr(a, 0, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ +} \ +template inline \ +_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \ +{ \ + return rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ +} \ +template inline \ +void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ +} + +#define OPENCV_HAL_IMPL_RVV_PACK_32(_Tpvec, _Tp, _wTpvec, hwidth, hsuffix, suffix, rshr, shr) \ +inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \ +{ \ + return shr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, VTraits<_Tpvec>::vlanes()); \ +} \ +inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, shr(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ +} \ +template inline \ +_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b, int N = n) \ +{ \ + return 
rshr(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), N, 0, VTraits<_Tpvec>::vlanes()); \ +} \ +template inline \ +void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a, int N = n) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, rshr(a, N, 0, VTraits<_Tpvec>::vlanes()), VTraits<_wTpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_PACK(v_uint8, uchar, v_uint16, 8, u8, u16, __riscv_vnclipu, __riscv_vnclipu) +OPENCV_HAL_IMPL_RVV_PACK(v_int8, schar, v_int16, 8, i8, i16, __riscv_vnclip, __riscv_vnclip) +OPENCV_HAL_IMPL_RVV_PACK(v_uint16, ushort, v_uint32, 16, u16, u32, __riscv_vnclipu, __riscv_vnclipu) +OPENCV_HAL_IMPL_RVV_PACK(v_int16, short, v_int32, 16, i16, i32, __riscv_vnclip, __riscv_vnclip) +OPENCV_HAL_IMPL_RVV_PACK_32(v_uint32, unsigned, v_uint64, 32, u32, u64, __riscv_vnclipu, __riscv_vnsrl) +OPENCV_HAL_IMPL_RVV_PACK_32(v_int32, int, v_int64, 32, i32, i64, __riscv_vnclip, __riscv_vnsra) + +#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, hwidth, width, hsuffix, suffix, cast, hvl, vl) \ +inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \ +{ \ + return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, vl)), 0, 0, vl); \ +} \ +inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m2(__riscv_vmax(a, 0, vl)), 0, 0, vl), hvl); \ +} \ +template inline \ +_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b, int n = N) \ +{ \ + return __riscv_vnclipu(cast(__riscv_vmax(__riscv_vset(__riscv_vlmul_ext_##suffix##m4(a), 1, b), 0, vl)), n, 0, vl); \ +} \ +template inline \ +void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a, int n = N) \ +{ \ + __riscv_vse##hwidth##_v_##hsuffix##m1(ptr, __riscv_vnclipu(__riscv_vreinterpret_u##width##m2(__riscv_vmax(a, 0, vl)), n, 0, vl), hvl); \ +} + +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8, uchar, v_int16, short, 8, 16, u8, i16, __riscv_vreinterpret_v_i16m4_u16m4, VTraits::vlanes(), VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16, ushort, v_int32, int, 16, 32, u16, i32, __riscv_vreinterpret_v_i32m4_u32m4, VTraits::vlanes(), VTraits::vlanes()) + + +/* void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) + a0 = {A1 A2 A3 A4} + a1 = {B1 B2 B3 B4} +--------------- + {A1 B1 A2 B2} and {A3 B3 A4 B4} +*/ + +#define OPENCV_HAL_IMPL_RVV_ZIP(_Tpvec, _wTpvec, suffix, width, width2, convert2um2, convert2um1) \ +inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) { \ + _wTpvec temp = __riscv_vreinterpret_##suffix##m4(convert2um2( \ + __riscv_vor(__riscv_vzext_vf2(convert2um1(a0), VTraits<_Tpvec>::vlanes()*2), \ + __riscv_vreinterpret_u##width2##m4(__riscv_vslide1up(__riscv_vreinterpret_u##width##m4(__riscv_vzext_vf2(convert2um1(a1), VTraits<_Tpvec>::vlanes()*2)), 0, VTraits<_Tpvec>::vlanes()*2)), \ + VTraits<_Tpvec>::vlanes()))); \ + b0 = __riscv_vget_##suffix##m2(temp, 0); \ + b1 = __riscv_vget_##suffix##m2(temp, 1); \ +} +OPENCV_HAL_IMPL_RVV_ZIP(v_uint8, vuint8m4_t, u8, 8, 16, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int8, vint8m4_t, i8, 8, 16, __riscv_vreinterpret_u8m4, __riscv_vreinterpret_u8m2) +OPENCV_HAL_IMPL_RVV_ZIP(v_uint16, vuint16m4_t, u16, 16, 32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int16, vint16m4_t, i16, 16, 32, __riscv_vreinterpret_u16m4, __riscv_vreinterpret_u16m2) +OPENCV_HAL_IMPL_RVV_ZIP(v_uint32, vuint32m4_t, u32, 32, 64, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_ZIP(v_int32, vint32m4_t, i32, 
32, 64, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) +OPENCV_HAL_IMPL_RVV_ZIP(v_float32, vfloat32m4_t, f32, 32, 64, __riscv_vreinterpret_u32m4, __riscv_vreinterpret_u32m2) + +#if CV_SIMD_SCALABLE_64F +inline void v_zip(const v_float64& a0, const v_float64& a1, v_float64& b0, v_float64& b1) { \ + vuint16mf2_t idx0 = __riscv_vid_v_u16mf2(VTraits::vlanes()); + vuint16mf2_t idx1 = __riscv_vadd(idx0, VTraits::vlanes(), VTraits::vlanes()); + vuint16m1_t idx = __riscv_vreinterpret_u16m1(( \ + __riscv_vor(__riscv_vzext_vf2(idx0, VTraits::vlanes()), \ + __riscv_vreinterpret_u32m1(__riscv_vslide1up(__riscv_vreinterpret_u16m1(__riscv_vzext_vf2(idx1, VTraits::vlanes())), 0, VTraits::vlanes())), \ + VTraits::vlanes()))); +#if 0 + vfloat64m4_t temp = __riscv_vcreate_v_f64m2_f64m4(a0, a1); +#else // TODO: clean up when RVV Intrinsic is frozen. + vfloat64m4_t temp = __riscv_vlmul_ext_f64m4(a0); + temp = __riscv_vset(temp, 1, a1); +#endif + temp = __riscv_vrgatherei16(temp, idx, VTraits::vlanes()*2); + b0 = __riscv_vget_f64m2(temp, 0); \ + b1 = __riscv_vget_f64m2(temp, 1); \ +} +#endif + +#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, width) \ +inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup(a, b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes());\ +} \ +inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return __riscv_vslideup( \ + __riscv_vslidedown(a, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \ + __riscv_vslidedown(b, VTraits<_Tpvec>::vlanes()/2, VTraits<_Tpvec>::vlanes()), \ + VTraits<_Tpvec>::vlanes()/2, \ + VTraits<_Tpvec>::vlanes()); \ +} \ +inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ +{ \ + c = v_combine_low(a, b); \ + d = v_combine_high(a, b); \ +} + +OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint8, 8) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_int8, 8) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint16, 16) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_int16, 16) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_uint32, 32) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_int32, 32) +OPENCV_HAL_IMPL_RVV_UNPACKS(v_float32, 32) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_UNPACKS(v_float64, 64) +#endif + +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width, hwidth, vl) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + a = __riscv_vlse##width##_v_##suffix##m2(ptr , sizeof(_Tp)*2, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2(ptr+1, sizeof(_Tp)*2, VTraits::vlanes()); \ +}\ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + a = __riscv_vlse##width##_v_##suffix##m2(ptr , sizeof(_Tp)*3, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2(ptr+1, sizeof(_Tp)*3, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m2(ptr+2, sizeof(_Tp)*3, VTraits::vlanes()); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + \ + a = __riscv_vlse##width##_v_##suffix##m2(ptr , sizeof(_Tp)*4, VTraits::vlanes()); \ + b = __riscv_vlse##width##_v_##suffix##m2(ptr+1, sizeof(_Tp)*4, VTraits::vlanes()); \ + c = __riscv_vlse##width##_v_##suffix##m2(ptr+2, sizeof(_Tp)*4, VTraits::vlanes()); \ + d = __riscv_vlse##width##_v_##suffix##m2(ptr+3, sizeof(_Tp)*4, VTraits::vlanes()); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + __riscv_vsse##width(ptr, sizeof(_Tp)*2, 
a, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+1, sizeof(_Tp)*2, b, VTraits::vlanes()); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + __riscv_vsse##width(ptr, sizeof(_Tp)*3, a, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+1, sizeof(_Tp)*3, b, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+2, sizeof(_Tp)*3, c, VTraits::vlanes()); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + __riscv_vsse##width(ptr, sizeof(_Tp)*4, a, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+1, sizeof(_Tp)*4, b, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+2, sizeof(_Tp)*4, c, VTraits::vlanes()); \ + __riscv_vsse##width(ptr+3, sizeof(_Tp)*4, d, VTraits::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8, uchar, u8, 8, 4, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8, schar, i8, 8, 4, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16, ushort, u16, 16, 8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16, short, i16, 16, 8, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32, unsigned, u32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32, int, i32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32, float, f32, 32, 16, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64, uint64, u64, 64, 32, VTraits::vlanes()) +OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64, int64, i64, 64, 32, VTraits::vlanes()) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64, double, f64, 64, 32, VTraits::vlanes()) +#endif + +static uint64_t idx_interleave_pairs[] = { \ + 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x1715161413111210, 0x1f1d1e1c1b191a18, \ + 0x2725262423212220, 0x2f2d2e2c2b292a28, 0x3735363433313230, 0x3f3d3e3c3b393a38, \ + 0x4745464443414240, 0x4f4d4e4c4b494a48, 0x5755565453515250, 0x5f5d5e5c5b595a58, \ + 0x6765666463616260, 0x6f6d6e6c6b696a68, 0x7775767473717270, 0x7f7d7e7c7b797a78}; + +static uint64_t idx_interleave_quads[] = { \ + 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x1713161215111410, 0x1f1b1e1a1d191c18, \ + 0x2723262225212420, 0x2f2b2e2a2d292c28, 0x3733363235313430, 0x3f3b3e3a3d393c38, \ + 0x4743464245414440, 0x4f4b4e4a4d494c48, 0x5753565255515450, 0x5f5b5e5a5d595c58, \ + 0x6763666265616460, 0x6f6b6e6a6d696c68, 0x7773767275717470, 0x7f7b7e7a7d797c78}; + +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(_Tpvec, func) \ +inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \ + CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \ + vuint8m2_t vidx = __riscv_vundefined_u8m2();\ + vidx = __riscv_vreinterpret_u8m2(__riscv_vle64_v_u64m2(idx_interleave_##func, 16)); \ + return __riscv_vrgather(vec, vidx, VTraits::vlanes()); \ +} +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_uint8, quads) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ_NOEXPEND(v_int8, quads) + +#define OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(_Tpvec, width, vzext_vfx, func) \ +inline _Tpvec v_interleave_##func(const _Tpvec& vec) { \ + CV_CheckLE(VTraits<_Tpvec>::vlanes(), VTraits<_Tpvec>::max_nlanes, "RVV implementation only supports VLEN in the range [128, 1024]"); \ + vuint##width##m2_t vidx = 
__riscv_vundefined_u##width##m2();\ + vidx = __riscv_vget_u##width##m2(vzext_vfx(__riscv_vreinterpret_u8m2(__riscv_vle64_v_u64m2(idx_interleave_##func, 16)), VTraits::vlanes()), 0); \ + return __riscv_vrgather(vec, vidx, VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, __riscv_vzext_vf2, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, __riscv_vzext_vf2, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, __riscv_vzext_vf4, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, __riscv_vzext_vf4, pairs) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, __riscv_vzext_vf4, pairs) + +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint16, 16, __riscv_vzext_vf2, quads) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int16, 16, __riscv_vzext_vf2, quads) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_uint32, 32, __riscv_vzext_vf4, quads) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_int32, 32, __riscv_vzext_vf4, quads) +OPENCV_HAL_IMPL_RVV_INTERLEAVED_PQ(v_float32, 32, __riscv_vzext_vf4, quads) + +//////////// PopCount ////////// +static const unsigned char popCountTable[256] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +#define OPENCV_HAL_IMPL_RVV_HADD(_Tpvec, _Tpvec2, _Tm2, width, width2, suffix, add) \ +static inline _Tpvec2 v_hadd(_Tpvec a) { \ + vuint##width2##m2_t oneX2 = __riscv_vmv_v_x_u##width2##m2(1, VTraits::vlanes()); \ + vuint##width##m2_t one = __riscv_vreinterpret_u##width##m2(oneX2); \ + _Tm2 res = add(a, __riscv_vslide1down(a, 0, VTraits::vlanes()), VTraits::vlanes()); \ + return __riscv_vget_##suffix##m2(__riscv_vcompress(res, __riscv_vmseq(one, 1, VTraits::vlanes()), VTraits::vlanes()), 0); \ +} +OPENCV_HAL_IMPL_RVV_HADD(v_uint8, v_uint16, vuint16m4_t, 8, 16, u16, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_uint16, v_uint32, vuint32m4_t, 16, 32, u32, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_uint32, v_uint64, vuint64m4_t, 32, 64, u64, __riscv_vwaddu_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int8, v_int16, vint16m4_t, 8, 16, i16, __riscv_vwadd_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int16, v_int32, vint32m4_t, 16, 32, i32, __riscv_vwadd_vv) +OPENCV_HAL_IMPL_RVV_HADD(v_int32, v_int64, vint64m4_t, 32, 64, i64, __riscv_vwadd_vv) + +OPENCV_HAL_IMPL_RVV_HADD(vint32m4_t, v_int32, vint32m4_t, 16, 32, i32, __riscv_vadd) +OPENCV_HAL_IMPL_RVV_HADD(vint64m4_t, v_int64, vint64m4_t, 32, 64, i64, __riscv_vadd) + +inline v_uint8 v_popcount(const v_uint8& a) +{ + return __riscv_vloxei8(popCountTable, a, VTraits::vlanes()); +} +inline v_uint16 v_popcount(const v_uint16& a) +{ + return v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a))); +} +inline v_uint32 v_popcount(const v_uint32& a) +{ + return v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a)))); +} +inline v_uint64 v_popcount(const v_uint64& a) +{ + return 
v_hadd(v_hadd(v_hadd(v_popcount(__riscv_vreinterpret_u8m2(a))))); +} + +inline v_uint8 v_popcount(const v_int8& a) +{ + return v_popcount(v_abs(a));\ +} +inline v_uint16 v_popcount(const v_int16& a) +{ + return v_popcount(v_abs(a));\ +} +inline v_uint32 v_popcount(const v_int32& a) +{ + return v_popcount(v_abs(a));\ +} +inline v_uint64 v_popcount(const v_int64& a) +{ + // max(0 - a) is used, since v_abs does not support 64-bit integers. + return v_popcount(v_reinterpret_as_u64(__riscv_vmax(a, v_sub(v_setzero_s64(), a), VTraits::vlanes()))); +} + + +//////////// SignMask //////////// +#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec) \ +inline int v_signmask(const _Tpvec& a) \ +{ \ + uint8_t ans[4] = {0}; \ + __riscv_vsm(ans, __riscv_vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ + return *(reinterpret_cast(ans)) & (((__int128_t)1 << VTraits<_Tpvec>::vlanes()) - 1); \ +} \ +inline int v_scan_forward(const _Tpvec& a) \ +{ \ + return (int)__riscv_vfirst(__riscv_vmslt(a, 0, VTraits<_Tpvec>::vlanes()), VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int8) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int16) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int32) +OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_int64) + +inline int64 v_signmask(const v_uint8& a) +{ return v_signmask(v_reinterpret_as_s8(a)); } +inline int64 v_signmask(const v_uint16& a) +{ return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float32& a) +{ return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } +#if CV_SIMD_SCALABLE_64F +inline int v_signmask(const v_float64& a) +{ return v_signmask(v_reinterpret_as_s64(a)); } +#endif + +//////////// Scan forward //////////// +inline int v_scan_forward(const v_uint8& a) +{ return v_scan_forward(v_reinterpret_as_s8(a)); } +inline int v_scan_forward(const v_uint16& a) +{ return v_scan_forward(v_reinterpret_as_s16(a)); } +inline int v_scan_forward(const v_uint32& a) +{ return v_scan_forward(v_reinterpret_as_s32(a)); } +inline int v_scan_forward(const v_float32& a) +{ return v_scan_forward(v_reinterpret_as_s32(a)); } +inline int v_scan_forward(const v_uint64& a) +{ return v_scan_forward(v_reinterpret_as_s64(a)); } +#if CV_SIMD_SCALABLE_64F +inline int v_scan_forward(const v_float64& a) +{ return v_scan_forward(v_reinterpret_as_s64(a)); } +#endif + +//////////// Pack triplets //////////// +// {A0, A1, A2, A3, B0, B1, B2, B3, C0 ...} --> {A0, A1, A2, B0, B1, B2, C0 ...} +// mask: {0,0,0,1, ...} -> {T,T,T,F, ...} +#define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, v_trunc) \ +inline _Tpvec v_pack_triplets(const _Tpvec& vec) { \ + size_t vl = VTraits::vlanes(); \ + vuint32m2_t one = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t zero = __riscv_vmv_v_x_u8m2(0, vl); \ + vuint8m2_t mask = __riscv_vreinterpret_u8m2(one); \ + return __riscv_vcompress(vec, __riscv_vmseq(v_trunc(__riscv_vslideup(zero, mask, 3, vl)), 0, vl), VTraits<_Tpvec>::vlanes()); \ +} + +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16, __riscv_vlmul_trunc_u8m1) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16, __riscv_vlmul_trunc_u8m1) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32, __riscv_vlmul_trunc_u8mf2) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32, __riscv_vlmul_trunc_u8mf2) 
+OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32, __riscv_vlmul_trunc_u8mf2) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint64, __riscv_vlmul_trunc_u8mf4) +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int64, __riscv_vlmul_trunc_u8mf4) +#if CV_SIMD_SCALABLE_64F +OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float64, __riscv_vlmul_trunc_u8mf4) +#endif + + +////// FP16 support /////// + +#if defined(__riscv_zfh) && __riscv_zfh +inline v_float32 v_load_expand(const hfloat* ptr) +{ + return __riscv_vfwcvt_f(__riscv_vle16_v_f16m1((_Float16*)ptr, VTraits::vlanes()) ,VTraits::vlanes());; +} + +inline void v_pack_store(hfloat* ptr, const v_float32& v) +{ + __riscv_vse16_v_f16m1((_Float16*)ptr, __riscv_vfncvt_f_f_w_f16m1(v, VTraits::vlanes()), VTraits::vlanes()); +} +#else +inline v_float32 v_load_expand(const hfloat* ptr) +{ + float buf[32]; + for( int i = 0; i < VTraits::vlanes(); i++ ) buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(hfloat* ptr, const v_float32& v) +{ + float buf[32]; + v_store(buf, v); + for( int i = 0; i < VTraits::vlanes(); i++ ) ptr[i] = hfloat(buf[i]); +} +#endif +////////////// Rounding ////////////// +inline v_int32 v_round(const v_float32& a) +{ + // return vfcvt_x(vfadd(a, 1e-6, VTraits::vlanes()), VTraits::vlanes()); + return __riscv_vfcvt_x(a, VTraits::vlanes()); +} + +inline v_int32 v_floor(const v_float32& a) +{ + return __riscv_vfcvt_x(__riscv_vfsub(a, 0.5f - 1e-5, VTraits::vlanes()), VTraits::vlanes()); + // return vfcvt_x(a, VTraits::vlanes()); +} + +inline v_int32 v_ceil(const v_float32& a) +{ + return __riscv_vfcvt_x(__riscv_vfadd(a, 0.5f - 1e-5, VTraits::vlanes()), VTraits::vlanes()); +} + +inline v_int32 v_trunc(const v_float32& a) +{ + return __riscv_vfcvt_rtz_x(a, VTraits::vlanes()); +} +#if CV_SIMD_SCALABLE_64F +inline v_int32 v_round(const v_float64& a) +{ + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); +} + +inline v_int32 v_round(const v_float64& a, const v_float64& b) +{ + // return vfncvt_x(vset(vlmul_ext_f64m2(vfadd(a, 1e-6, VTraits::vlanes())), 1, b), VTraits::vlanes()); + // Fix https://github.com/opencv/opencv/issues/24746 + return __riscv_vfncvt_x(__riscv_vset(__riscv_vlmul_ext_f64m4(a), 1, b), VTraits::vlanes()); +} + +inline v_int32 v_floor(const v_float64& a) +{ + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(__riscv_vfsub(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); +} + +inline v_int32 v_ceil(const v_float64& a) +{ + return __riscv_vfncvt_x(__riscv_vlmul_ext_f64m4(__riscv_vfadd(a, 0.5f - 1e-6, VTraits::vlanes())), VTraits::vlanes()); +} + +inline v_int32 v_trunc(const v_float64& a) +{ + return __riscv_vfncvt_rtz_x(__riscv_vlmul_ext_f64m4(a), VTraits::vlanes()); +} +#endif + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32 v_dotprod(const v_int16& a, const v_int16& b) +{ + vint32m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); + return v_hadd(temp1); +} + +inline v_int32 v_dotprod(const v_int16& a, const v_int16& b, const v_int32& c) +{ + vint32m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); + return __riscv_vadd(v_hadd(temp1), c, VTraits::vlanes()); +} + +// 32 >> 64 +inline v_int64 v_dotprod(const v_int32& a, const v_int32& b) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint32m2_t one32 = __riscv_vreinterpret_u32m2(one64); \ + vbool16_t mask = __riscv_vmseq(one32, 1, VTraits::vlanes()); \ + vint64m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint64m4_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); + vint64m4_t res = 
__riscv_vadd(temp1, temp2, VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vlmul_trunc_i64m2(res); \ +} +inline v_int64 v_dotprod(const v_int32& a, const v_int32& b, const v_int64& c) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint32m2_t one32 = __riscv_vreinterpret_u32m2(one64); \ + vbool16_t mask = __riscv_vmseq(one32, 1, VTraits::vlanes()); \ + vint64m4_t temp1 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint64m4_t temp2 = __riscv_vslide1down(temp1, 0, VTraits::vlanes()); + vint64m4_t res = __riscv_vadd(temp1, temp2, VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vadd(__riscv_vlmul_trunc_i64m2(res), c, VTraits::vlanes()); \ +} + +// 8 >> 32 +inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b) +{ + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vuint16m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint32m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vlmul_trunc_u32m2(res); +} + +inline v_uint32 v_dotprod_expand(const v_uint8& a, const v_uint8& b, + const v_uint32& c) +{ + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vuint16m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint32m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vadd(__riscv_vlmul_trunc_u32m2(res), c, VTraits::vlanes()); +} + +inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b) +{ + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vint16m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint16m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint32m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vlmul_trunc_i32m2(res); +} + +inline v_int32 v_dotprod_expand(const v_int8& a, const v_int8& b, + const v_int32& c) +{ + vuint32m2_t one32 = __riscv_vmv_v_x_u32m2(1, VTraits::vlanes()); \ + vuint8m2_t one8 = __riscv_vreinterpret_u8m2(one32); \ + vbool4_t mask = __riscv_vmseq(one8, 1, VTraits::vlanes()); \ + vint16m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint16m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint16m4_t t2= 
__riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint16m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint32m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vadd(__riscv_vlmul_trunc_i32m2(res), c, VTraits::vlanes()); +} + + +// // 16 >> 64 +inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vuint32m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint64m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vlmul_trunc_u64m2(res); +} +inline v_uint64 v_dotprod_expand(const v_uint16& a, const v_uint16& b, const v_uint64& c) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vuint32m4_t t0 = __riscv_vwmulu(a, b, VTraits::vlanes()); \ + vuint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vuint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vuint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vuint64m8_t res = __riscv_vadd(__riscv_vwaddu_vv(t2, t3, VTraits::vlanes()), __riscv_vwaddu_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vadd(__riscv_vlmul_trunc_u64m2(res), c, VTraits::vlanes()); +} + +inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vint32m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint64m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return __riscv_vlmul_trunc_i64m2(res); +} +inline v_int64 v_dotprod_expand(const v_int16& a, const v_int16& b, + const v_int64& c) +{ + vuint64m2_t one64 = __riscv_vmv_v_x_u64m2(1, VTraits::vlanes()); \ + vuint16m2_t one16 = __riscv_vreinterpret_u16m2(one64); \ + vbool8_t mask = __riscv_vmseq(one16, 1, VTraits::vlanes()); \ + vint32m4_t t0 = __riscv_vwmul(a, b, VTraits::vlanes()); \ + vint32m4_t t1= __riscv_vslide1down(t0, 0, VTraits::vlanes()); + vint32m4_t t2= __riscv_vslide1down(t1, 0, VTraits::vlanes()); + vint32m4_t t3= __riscv_vslide1down(t2, 0, VTraits::vlanes()); + vint64m8_t res = __riscv_vadd(__riscv_vwadd_vv(t2, t3, VTraits::vlanes()), __riscv_vwadd_vv(t0, t1, VTraits::vlanes()), VTraits::vlanes()); + res = __riscv_vcompress(res, mask, VTraits::vlanes()); \ + return 
__riscv_vadd(__riscv_vlmul_trunc_i64m2(res), c, VTraits::vlanes()); +} + +// // 32 >> 64f +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64 v_dotprod_expand(const v_int32& a, const v_int32& b, + const v_float64& c) +{ return v_add(v_dotprod_expand(a, b) , c); } +#endif + +//////// Fast Dot Product //////// +// 16 >> 32 +inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b) +{ + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); +} +inline v_int32 v_dotprod_fast(const v_int16& a, const v_int16& b, const v_int32& c) +{ + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); +} + +// 32 >> 64 +inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b) +{ + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); +} +inline v_int64 v_dotprod_fast(const v_int32& a, const v_int32& b, const v_int64& c) +{ + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); +} + + +// 8 >> 32 +inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b) +{ + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, VTraits::vlanes()); + auto res = __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_u32m2(0, VTraits::vlanes()), 0, res); +} +inline v_uint32 v_dotprod_expand_fast(const v_uint8& a, const v_uint8& b, const v_uint32& c) +{ + vuint32m1_t zero = __riscv_vmv_v_x_u32m1(0, VTraits::vlanes()); + auto res = __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_u32m2(0, VTraits::vlanes()), 0, res), VTraits::vlanes()); +} +inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b) +{ + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); +} +inline v_int32 v_dotprod_expand_fast(const v_int8& a, const v_int8& b, const v_int32& c) +{ + vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i32m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); +} + +// 16 >> 64 +inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b) +{ + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_u64m2(0, VTraits::vlanes()), 0, __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); +} +inline v_uint64 v_dotprod_expand_fast(const v_uint16& a, const v_uint16& b, 
const v_uint64& c) +{ + vuint64m1_t zero = __riscv_vmv_v_x_u64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_u64m2(0, VTraits::vlanes()), 0, __riscv_vwredsumu_tu(zero, __riscv_vwmulu(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); +} +inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b) +{ + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())); +} +inline v_int64 v_dotprod_expand_fast(const v_int16& a, const v_int16& b, const v_int64& c) +{ + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, VTraits::vlanes()); + return __riscv_vadd(c, __riscv_vset(__riscv_vmv_v_x_i64m2(0, VTraits::vlanes()), 0, __riscv_vwredsum_tu(zero, __riscv_vwmul(a, b, VTraits::vlanes()), zero, VTraits::vlanes())), VTraits::vlanes()); +} + +// 32 >> 64f +#if CV_SIMD_SCALABLE_64F +inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b) +{ return v_cvt_f64(v_dotprod_fast(a, b)); } +inline v_float64 v_dotprod_expand_fast(const v_int32& a, const v_int32& b, const v_float64& c) +{ return v_add(v_dotprod_expand_fast(a, b) , c); } +#endif + +// TODO: only 128 bit now. +inline v_float32 v_matmul(const v_float32& v, const v_float32& mat0, + const v_float32& mat1, const v_float32& mat2, + const v_float32& mat3) +{ + vfloat32m2_t res; + res = __riscv_vfmul_vf_f32m2(mat0, v_extract_n(v, 0), VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 1), mat1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 2), mat2, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v, 3), mat3, VTraits::vlanes()); + return res; +} + +// TODO: only 128 bit now. +inline v_float32 v_matmuladd(const v_float32& v, const v_float32& mat0, + const v_float32& mat1, const v_float32& mat2, + const v_float32& a) +{ + vfloat32m2_t res = __riscv_vfmul_vf_f32m2(mat0, v_extract_n(v,0), VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v,1), mat1, VTraits::vlanes()); + res = __riscv_vfmacc_vf_f32m2(res, v_extract_n(v,2), mat2, VTraits::vlanes()); + return __riscv_vfadd(res, a, VTraits::vlanes()); +} + +inline void v_cleanup() {} + +#include "intrin_math.hpp" +inline v_float32 v_exp(const v_float32& x) { return v_exp_default_32f(x); } +inline v_float32 v_log(const v_float32& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32& x, v_float32& s, v_float32& c) { v_sincos_default_32f(x, s, c); } +inline v_float32 v_sin(const v_float32& x) { return v_sin_default_32f(x); } +inline v_float32 v_cos(const v_float32& x) { return v_cos_default_32f(x); } +inline v_float32 v_erf(const v_float32& x) { return v_erf_default_32f(x); } + +inline v_float64 v_exp(const v_float64& x) { return v_exp_default_64f(x); } +inline v_float64 v_log(const v_float64& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64& x, v_float64& s, v_float64& c) { v_sincos_default_64f(x, s, c); } +inline v_float64 v_sin(const v_float64& x) { return v_sin_default_64f(x); } +inline v_float64 v_cos(const v_float64& x) { return v_cos_default_64f(x); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} //namespace cv + +#endif //OPENCV_HAL_INTRIN_RVV_SCALABLE_HPP diff --git a/3rdParty/opencv2/core/hal/intrin_sse.hpp b/3rdParty/opencv2/core/hal/intrin_sse.hpp index f323d54b13..88ce25737b 100644 --- a/3rdParty/opencv2/core/hal/intrin_sse.hpp +++ b/3rdParty/opencv2/core/hal/intrin_sse.hpp @@ -347,6 +347,8 @@ namespace hal_sse_internal #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ { return _Tpvec(cast(a.val)); } @@ -364,6 +366,11 @@ inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); } inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); } inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); } +template <> inline v_uint64x2 v_setzero_() { return v_setzero_u64(); } +template <> inline v_int64x2 v_setzero_() { return v_setzero_s64(); } +template <> inline v_uint64x2 v_setall_(uint64 val) { return v_setall_u64(val); } +template <> inline v_int64x2 v_setall_(int64 val) { return v_setall_s64(val); } + template inline v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); } template inline @@ -735,53 +742,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \ - inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ - } \ - inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ - { \ - a.val = intrin(a.val, b.val); \ - return a; \ - } - -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int32x4, _v128_mullo_epi32) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64) -OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64) + } + +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint8x16, 
_mm_adds_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint8x16, _mm_subs_epu8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int8x16, _mm_adds_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int8x16, _mm_subs_epi8) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint16x8, _mm_adds_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint16x8, _mm_subs_epu16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int16x8, _mm_adds_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int16x8, _mm_subs_epi16) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_uint32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int32x4, _mm_add_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int32x4, _mm_sub_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_int32x4, _v128_mullo_epi32) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float32x4, _mm_add_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float32x4, _mm_sub_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float32x4, _mm_mul_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float32x4, _mm_div_ps) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_float64x2, _mm_add_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_float64x2, _mm_sub_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_mul, v_float64x2, _mm_mul_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_div, v_float64x2, _mm_div_pd) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_uint64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_uint64x2, _mm_sub_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_add, v_int64x2, _mm_add_epi64) +OPENCV_HAL_IMPL_SSE_BIN_OP(v_sub, v_int64x2, _mm_sub_epi64) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_SSE_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_SSE_MUL_SAT(v_int8x16, v_int16x8) @@ -845,7 +845,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) { return v_int32x4(_mm_madd_epi16(a.val, b.val)); } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) @@ -872,7 +872,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) #endif } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) @@ -886,7 +886,7 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) { @@ -899,7 +899,7 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) return v_int32x4(_mm_add_epi32(p0, p1)); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } 
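For reference, a minimal usage sketch (editor-added, not part of the patch) of how caller code adapts to this hunk's change from operator overloads to the named wrappers; it assumes only the universal-intrinsic types and functions defined in this header (v_setall_s32, v_setall_s8, v_add, v_mul, v_dotprod_expand):

    // before this patch: v_int32x4 r = a * b + c;
    v_int32x4 a = v_setall_s32(3), b = v_setall_s32(4), c = v_setall_s32(1);
    v_int32x4 r = v_add(v_mul(a, b), c);            // named forms replace operator* / operator+
    // widening 8->32 dot product with accumulator, previously v_dotprod_expand(x, y) + acc
    v_int8x16 x = v_setall_s8(2), y = v_setall_s8(5);
    v_int32x4 acc = v_setall_s32(7);
    v_int32x4 d = v_dotprod_expand(x, y, acc);
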
// 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -911,14 +911,14 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; + c0 = v_add(c0, c1); d0 = v_add(d0, d1); return v_uint64x2(_mm_add_epi64( _mm_unpacklo_epi64(c0.val, d0.val), _mm_unpackhi_epi64(c0.val, d0.val) )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -931,7 +931,7 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) @@ -939,8 +939,8 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #if CV_SSE4_1 return v_cvt_f64(v_dotprod(a, b)); #else - v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b); - v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b); + v_float64x2 c = v_mul(v_cvt_f64(a), v_cvt_f64(b)); + v_float64x2 d = v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b)); return v_float64x2(_mm_add_pd( _mm_unpacklo_pd(c.val, d.val), @@ -949,7 +949,7 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) #endif } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -957,13 +957,13 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, cons inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) { return v_dotprod(a, b); } inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 32 >> 64 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) { return v_dotprod(a, b); } inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod_fast(a, b) + c; } +{ return v_add(v_dotprod_fast(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) @@ -977,7 +977,7 @@ inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b return v_uint32x4(_mm_add_epi32(p0, p1)); } inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) { @@ -994,7 +994,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) #endif } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1006,34 +1006,34 @@ inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b v_expand(c, c0, c1); v_expand(d, d0, d1); - c0 += c1; d0 += d1; - return c0 + d0; + c0 = 
v_add(c0, c1); d0 = v_add(d0, d1); + return v_add(c0, d0); } inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) { v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c); inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) -{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); } +{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_mul(v_cvt_f64_high(a), v_cvt_f64_high(b))); } inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) { return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); } #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \ - OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \ - inline _Tpvec operator ~ (const _Tpvec& a) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_and, _Tpvec, _mm_and_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_or, _Tpvec, _mm_or_##suffix) \ + OPENCV_HAL_IMPL_SSE_BIN_OP(v_xor, _Tpvec, _mm_xor_##suffix) \ + inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \ } @@ -1182,58 +1182,58 @@ inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b) } #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \ -inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_eq(const _Tpuvec& a, const _Tpuvec& b) \ { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ne(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_eq(const _Tpsvec& a, const _Tpsvec& b) \ { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ne(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_lt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \ } \ -inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_gt(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \ } \ -inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_le(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = 
_mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \ +inline _Tpuvec v_ge(const _Tpuvec& a, const _Tpuvec& b) \ { \ __m128i smask = _mm_set1_##suffix(sbit); \ __m128i not_mask = _mm_set1_epi32(-1); \ __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \ return _Tpuvec(_mm_xor_si128(res, not_mask)); \ } \ -inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_lt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \ } \ -inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_gt(const _Tpsvec& a, const _Tpsvec& b) \ { \ return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \ } \ -inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_le(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \ } \ -inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \ +inline _Tpsvec v_ge(const _Tpsvec& a, const _Tpsvec& b) \ { \ __m128i not_mask = _mm_set1_epi32(-1); \ return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \ @@ -1244,17 +1244,17 @@ OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768) OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000) #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); } OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps) @@ -1262,17 +1262,17 @@ OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd) #if CV_SSE4_1 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq (const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne (const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #else #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \ return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \ -inline _Tpvec operator != 
(const _Tpvec& a, const _Tpvec& b) \ -{ return ~(a == b); } +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return v_not(v_eq(a, b)); } #endif OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2) @@ -1311,17 +1311,17 @@ inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1329,25 +1329,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1381,12 +1381,12 @@ inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \ } \ inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - _Tpvec res = v_fma(a, a, b*b); \ + _Tpvec res = v_fma(a, a, v_mul(b, b)); \ return _Tpvec(_mm_sqrt_##suffix(res.val)); \ } \ inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ { \ - return v_fma(a, a, b*b); \ + return v_fma(a, a, v_mul(b, b)); \ } \ inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ { \ @@ -1397,19 +1397,19 @@ OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32(( OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1)) #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return 
_Tpsvec(srai(a.val, imm)); \ } \ @@ -1711,9 +1711,9 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32) inline int v_reduce_sum(const v_int16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline unsigned v_reduce_sum(const v_uint16x8& a) -{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +{ return v_reduce_sum(v_add(v_expand_low(a), v_expand_high(a))); } inline uint64 v_reduce_sum(const v_uint64x2& a) { @@ -1770,13 +1770,13 @@ inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1805,15 +1805,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { @@ -1921,11 +1921,12 @@ OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8, v_int32x4, short, _v128_cvtepi16_epi OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64) OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4, v_int64x2, int, _v128_cvtepi32_epi64) -#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \ - inline _Tpvec v_load_expand_q(const _Tp* ptr) \ - { \ - __m128i a = _mm_cvtsi32_si128(*(const int*)ptr); \ - return _Tpvec(intrin(a)); \ +#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v_load_expand_q(const _Tp* ptr) \ + { \ + typedef int CV_DECL_ALIGNED(1) unaligned_int; \ + __m128i a = _mm_cvtsi32_si128(*(const unaligned_int*)ptr); \ + return _Tpvec(intrin(a)); \ } OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32) @@ -3406,7 +3407,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& v) ////////////// FP16 support /////////////////////////// -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { #if CV_FP16 return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr))); @@ -3426,7 +3427,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) #endif } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { #if CV_FP16 __m128i fp16_value = _mm_cvtps_ph(v.val, 0); @@ -3458,6 +3459,21 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v) inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 
v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } + + CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/intrin_vsx.hpp b/3rdParty/opencv2/core/hal/intrin_vsx.hpp index e3f4468ca3..b0e2a3472b 100644 --- a/3rdParty/opencv2/core/hal/intrin_vsx.hpp +++ b/3rdParty/opencv2/core/hal/intrin_vsx.hpp @@ -261,6 +261,8 @@ OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double) #define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(_Tp v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \ { return _Tpvec((cast)a.val); } @@ -513,48 +515,44 @@ inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) /* Element-wise binary and unary operations */ /** Arithmetics **/ #define OPENCV_HAL_IMPL_VSX_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ -{ return _Tpvec(intrin(a.val, b.val)); } \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ a.val = intrin(a.val, b.val); return a; } - -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int8x16, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int16x8, vec_adds) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int16x8, vec_subs) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_uint32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_int32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float32x4, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float32x4, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float32x4, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float32x4, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_float64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_float64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(*, v_float64x2, vec_mul) -OPENCV_HAL_IMPL_VSX_BIN_OP(/, v_float64x2, vec_div) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_uint64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub) -OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int64x2, vec_add) -OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_int64x2, vec_sub) +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) 
\ +{ return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int8x16, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int8x16, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int16x8, vec_adds) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int16x8, vec_subs) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_uint32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_int32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float32x4, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float32x4, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float32x4, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float32x4, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_float64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_float64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_mul, v_float64x2, vec_mul) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_div, v_float64x2, vec_div) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_uint64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_uint64x2, vec_sub) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_add, v_int64x2, vec_add) +OPENCV_HAL_IMPL_VSX_BIN_OP(v_sub, v_int64x2, vec_sub) // saturating multiply #define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec) \ - inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ + inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ - } \ - inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ - { a = a * b; return a; } + } OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16, v_int16x8) OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8) @@ -596,9 +594,9 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_mul_wrap, vec_mul) /** Bitwise shifts **/ #define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \ -inline _Tpvec operator << (const _Tpvec& a, int imm) \ +inline _Tpvec v_shl(const _Tpvec& a, int imm) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ -inline _Tpvec operator >> (const _Tpvec& a, int imm) \ +inline _Tpvec v_shr(const _Tpvec& a, int imm) \ { return _Tpvec(shr(a.val, splfunc(imm))); } \ template inline _Tpvec v_shl(const _Tpvec& a) \ { return _Tpvec(vec_sl(a.val, splfunc(imm))); } \ @@ -617,10 +615,10 @@ OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp) /** Bitwise logic **/ #define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(&, _Tpvec, vec_and) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(|, _Tpvec, vec_or) \ -OPENCV_HAL_IMPL_VSX_BIN_OP(^, _Tpvec, vec_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_and, _Tpvec, vec_and) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_or, _Tpvec, vec_or) \ +OPENCV_HAL_IMPL_VSX_BIN_OP(v_xor, _Tpvec, vec_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { return _Tpvec(vec_not(a.val)); } OPENCV_HAL_IMPL_VSX_LOGIC_OP(v_uint8x16) @@ -650,17 +648,17 @@ OPENCV_HAL_IMPL_VSX_SELECT(v_float64x2, vec_bdword2_c) /** Comparison **/ #define OPENCV_HAL_IMPL_VSX_INT_CMP_OP(_Tpvec) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpeq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec 
v_ne(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmplt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpgt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmple(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(vec_cmpge(a.val, b.val)); } OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint8x16) @@ -1060,7 +1058,7 @@ OPENCV_HAL_IMPL_VSX_MULADD(v_float32x4) OPENCV_HAL_IMPL_VSX_MULADD(v_float64x2) inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) -{ return a * b + c; } +{ return v_add(v_mul(a, b), c); } // TODO: exp, log, sin, cos @@ -1089,12 +1087,12 @@ inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) -{ return v_reinterpret_as_u32(v_max(a, b) - v_min(a, b)); } +{ return v_reinterpret_as_u32(v_sub(v_max(a, b), v_min(a, b))); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b) -{ return v_abs(a - b); } +{ return v_abs(v_sub(a, b)); } /** Absolute difference for signed integers **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) @@ -1361,7 +1359,7 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) /////// FP16 support //////// -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr); #if CV_VSX3 && defined(vec_extract_fp_from_shorth) @@ -1388,7 +1386,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) #endif } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { // fixme: Is there any builtin op or intrinsic that cover "xvcvsphp"?
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM) @@ -1442,7 +1440,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) return v_int64x2(vec_add(even, odd)); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } // 8 >> 32 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) @@ -1485,7 +1483,7 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) return v_uint64x2(vec_add(s0, s1)); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1495,13 +1493,13 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val))); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1531,7 +1529,7 @@ inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z))); } inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) @@ -1544,10 +1542,10 @@ inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) v_int32x4 prod = v_dotprod(a, b); v_int64x2 c, d; v_expand(prod, c, d); - return c + d; + return v_add(c, d); } inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand_fast(a, b) + c; } +{ return v_add(v_dotprod_expand_fast(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) @@ -1598,6 +1596,19 @@ template inline Tvec v_broadcast_element(const Tvec& v) { return Tvec(vec_splat(v.val, i)); } +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f(x); } +inline v_float64x2 
v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END diff --git a/3rdParty/opencv2/core/hal/intrin_wasm.hpp b/3rdParty/opencv2/core/hal/intrin_wasm.hpp index a7d1176f97..d8d27edb3a 100644 --- a/3rdParty/opencv2/core/hal/intrin_wasm.hpp +++ b/3rdParty/opencv2/core/hal/intrin_wasm.hpp @@ -10,6 +10,16 @@ #include #include "opencv2/core/saturate.hpp" + +// Emscripten v2.0.13 (latest officially supported, as of 07/30/2024): +// __EMSCRIPTEN_major__, __EMSCRIPTEN_minor__ and __EMSCRIPTEN_tiny__ are defined via commandline in +// https://github.com/emscripten-core/emscripten/blob/1690a5802cd1241adc9714fb7fa2f633d38860dc/tools/shared.py#L506-L515 +// +// See https://github.com/opencv/opencv/pull/25909 +#ifndef __EMSCRIPTEN_major__ +#include <emscripten/version.h> +#endif + #define CV_SIMD128 1 #define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it. #define CV_SIMD128_FP16 0 @@ -294,35 +304,35 @@ static const unsigned char popCountTable[] = } // namespace static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23); + return wasm_i8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23); } static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23); + return wasm_i8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23); } static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23); + return wasm_i8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23); } static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + return wasm_i8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); } static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31); + return wasm_i8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31); } static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); + return wasm_i8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); } static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); + return wasm_i8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); } static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) { - return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + return wasm_i8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); } /** Convert **/ @@ -391,6 +401,8 @@ inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a) #define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \ inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \ +template <> inline _Tpvec v_setzero_() { return v_setzero_##suffix(); } \ +template <> inline _Tpvec v_setall_(_Tp v) { return v_setall_##suffix(v); } \ template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ { return _Tpvec(a.val); } @@ -411,7 +423,7 @@ inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) v128_t maxval = wasm_i16x8_splat(255); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval)); v128_t b1
= wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval)); - return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_uint8x16(wasm_i8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) { @@ -421,14 +433,14 @@ inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval)); - return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_int8x16(wasm_i8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) { v128_t maxval = wasm_i32x4_splat(65535); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval)); v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval)); - return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_uint16x8(wasm_i8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) { @@ -438,15 +450,15 @@ inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval)); - return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_int16x8(wasm_i8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) { - return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); + return v_uint32x4(wasm_i8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); } inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) { - return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); + return v_int32x4(wasm_i8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); } inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) { @@ -456,7 +468,7 @@ inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval)); - return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_uint8x16(wasm_i8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) { @@ -466,7 +478,7 @@ inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b) v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval)); - return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_uint16x8(wasm_i8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } template @@ -478,7 +490,7 @@ inline v_uint8x16 v_rshr_pack(const 
v_uint16x8& a, const v_uint16x8& b) v128_t maxval = wasm_i16x8_splat(255); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval)); v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval)); - return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_uint8x16(wasm_i8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } template inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) @@ -492,7 +504,7 @@ inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b) v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval)); v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval)); - return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_int8x16(wasm_i8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } template inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) @@ -503,7 +515,7 @@ inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b) v128_t maxval = wasm_i32x4_splat(65535); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval)); v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval)); - return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_uint16x8(wasm_i8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } template inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) @@ -517,7 +529,7 @@ inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b) v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval)); v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval)); - return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_int16x8(wasm_i8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } template inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) @@ -525,7 +537,7 @@ inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b) v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1))); v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n); v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n); - return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); + return v_uint32x4(wasm_i8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); } template inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) @@ -533,7 +545,7 @@ inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b) v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1))); v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n); v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n); - return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); + return v_int32x4(wasm_i8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); } template inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) @@ -547,7 +559,7 @@ inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b) v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval)); v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval)); - return 
v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_uint8x16(wasm_i8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } template inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) @@ -561,14 +573,14 @@ inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b) v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval)); v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval)); - return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); + return v_uint16x8(wasm_i8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); } inline void v_pack_store(uchar* ptr, const v_uint16x8& a) { v128_t maxval = wasm_i16x8_splat(255); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval)); - v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); uchar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -581,7 +593,7 @@ inline void v_pack_store(schar* ptr, const v_int16x8& a) v128_t minval = wasm_i16x8_splat(-128); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); schar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -592,7 +604,7 @@ inline void v_pack_store(ushort* ptr, const v_uint32x4& a) { v128_t maxval = wasm_i32x4_splat(65535); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval)); - v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); ushort t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -605,7 +617,7 @@ inline void v_pack_store(short* ptr, const v_int32x4& a) v128_t minval = wasm_i32x4_splat(-32768); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); short t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -614,7 +626,7 @@ inline void v_pack_store(short* ptr, const v_int32x4& a) } inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) { - v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); + v128_t r = wasm_i8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); unsigned t_ptr[4]; wasm_v128_store(t_ptr, r); for (int i=0; i<2; ++i) { @@ -623,7 +635,7 @@ inline void v_pack_store(unsigned* ptr, const v_uint64x2& a) } inline void v_pack_store(int* ptr, const v_int64x2& a) { - v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); + v128_t r = wasm_i8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); int t_ptr[4]; wasm_v128_store(t_ptr, r); for (int i=0; i<2; ++i) { @@ -636,7 +648,7 @@ inline void v_pack_u_store(uchar* ptr, const v_int16x8& a) v128_t minval = wasm_i16x8_splat(0); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); v128_t a2 = 
wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); uchar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -649,7 +661,7 @@ inline void v_pack_u_store(ushort* ptr, const v_int32x4& a) v128_t minval = wasm_i32x4_splat(0); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval)); v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); ushort t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -664,7 +676,7 @@ inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a) v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n); v128_t maxval = wasm_i16x8_splat(255); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); uchar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -680,7 +692,7 @@ inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a) v128_t minval = wasm_i16x8_splat(-128); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); schar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -694,7 +706,7 @@ inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a) v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n); v128_t maxval = wasm_i32x4_splat(65535); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval)); - v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); ushort t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -710,7 +722,7 @@ inline void v_rshr_pack_store(short* ptr, const v_int32x4& a) v128_t minval = wasm_i32x4_splat(-32768); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); short t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -722,7 +734,7 @@ inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a) { v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1))); v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n); - v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); + v128_t r = wasm_i8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); unsigned t_ptr[4]; wasm_v128_store(t_ptr, r); for (int i=0; i<2; ++i) { @@ -734,7 +746,7 @@ inline void v_rshr_pack_store(int* ptr, const v_int64x2& a) { v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1))); v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n); - v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); + v128_t r = wasm_i8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11); int t_ptr[4]; 
wasm_v128_store(t_ptr, r); for (int i=0; i<2; ++i) { @@ -750,7 +762,7 @@ inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a) v128_t minval = wasm_i16x8_splat(0); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); + v128_t r = wasm_i8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14); uchar t_ptr[16]; wasm_v128_store(t_ptr, r); for (int i=0; i<8; ++i) { @@ -766,7 +778,7 @@ inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a) v128_t minval = wasm_i32x4_splat(0); v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval)); v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval)); - v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); + v128_t r = wasm_i8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13); ushort t_ptr[8]; wasm_v128_store(t_ptr, r); for (int i=0; i<4; ++i) { @@ -779,7 +791,7 @@ inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) v128_t maxval = wasm_i16x8_splat(255); v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval)); v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval)); - return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); + return v_uint8x16(wasm_i8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); } inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, @@ -790,9 +802,9 @@ inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval)); v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval)); v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval)); - v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28); - v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28); - return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); + v128_t ab = wasm_i8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28); + v128_t cd = wasm_i8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28); + return v_uint8x16(wasm_i8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); } inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, @@ -808,13 +820,13 @@ inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uin v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval)); v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval)); v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval)); - v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); - v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); - v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); - v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); - v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); - v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); - return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); + v128_t ab = wasm_i8x16_shuffle(a1, b1, 
0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t cd = wasm_i8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t ef = wasm_i8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t gh = wasm_i8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t abcd = wasm_i8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + v128_t efgh = wasm_i8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + return v_uint8x16(wasm_i8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); } inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, @@ -848,53 +860,46 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, } #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ -inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec bin_op(const _Tpvec& a, const _Tpvec& b) \ { \ return _Tpvec(intrin(a.val, b.val)); \ -} \ -inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ -{ \ - a.val = intrin(a.val, b.val); \ - return a; \ -} - -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) -OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) -OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) -OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) +} + +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int32x4, 
wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_add, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_sub, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_mul, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(v_div, v_float64x2, wasm_f64x2_div) // saturating multiply 8-bit, 16-bit #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ -inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_mul(const _Tpvec& a, const _Tpvec& b) \ { \ _Tpwvec c, d; \ v_mul_expand(a, b, c, d); \ return v_pack(c, d); \ -} \ -inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ -{ a = a * b; return a; } +} OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) @@ -959,7 +964,7 @@ inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) v_expand(b, b0, b1); v128_t c = wasm_i32x4_mul(a0.val, b0.val); v128_t d = wasm_i32x4_mul(a1.val, b1.val); - return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); + return v_int16x8(wasm_i8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); } inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { @@ -968,7 +973,7 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) v_expand(b, b0, b1); v128_t c = wasm_i32x4_mul(a0.val, b0.val); v128_t d = wasm_i32x4_mul(a1.val, b1.val); - return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); + return v_uint16x8(wasm_i8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); } //////// Dot Product //////// @@ -985,7 +990,7 @@ inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) } inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) -{ return v_dotprod(a, b) + c; } +{ return v_add(v_dotprod(a, b), c); } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) { @@ -999,7 +1004,7 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) } inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) { - return v_dotprod(a, b) + c; + return v_add(v_dotprod(a, b), c); } // 8 >> 32 @@ -1009,13 +1014,13 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) v128_t a1 = wasm_u16x8_shr(a.val, 8); v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_u16x8_shr(b.val, 8); - return v_uint32x4(( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + - v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + return v_uint32x4((v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), + v_dotprod(v_int16x8(a1), v_int16x8(b1)))).val ); } inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const 
v_int8x16& b) { @@ -1023,13 +1028,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) v128_t a1 = wasm_i16x8_shr(a.val, 8); v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); v128_t b1 = wasm_i16x8_shr(b.val, 8); - return v_int32x4( - v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + return v_int32x4(v_add( + v_dotprod(v_int16x8(a0), v_int16x8(b0)), v_dotprod(v_int16x8(a1), v_int16x8(b1)) - ); + )); } inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 16 >> 64 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) @@ -1038,13 +1043,13 @@ inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) v128_t a1 = wasm_u32x4_shr(a.val, 16); v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_u32x4_shr(b.val, 16); - return v_uint64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_uint64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))).val - ); + )); } inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) { @@ -1052,20 +1057,20 @@ inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) v128_t a1 = wasm_i32x4_shr(a.val, 16); v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); v128_t b1 = wasm_i32x4_shr(b.val, 16); - return v_int64x2(( - v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + return v_int64x2((v_add( + v_dotprod(v_int32x4(a0), v_int32x4(b0)), v_dotprod(v_int32x4(a1), v_int32x4(b1))) - ); + )); } inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } // 32 >> 64f inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) { return v_cvt_f64(v_dotprod(a, b)); } inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) -{ return v_dotprod_expand(a, b) + c; } +{ return v_add(v_dotprod_expand(a, b), c); } //////// Fast Dot Product //////// @@ -1108,10 +1113,10 @@ inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, { return v_dotprod_expand(a, b, c); } #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ -OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ -inline _Tpvec operator ~ (const _Tpvec& a) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_and, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_or, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(v_xor, _Tpvec, wasm_v128_xor) \ +inline _Tpvec v_not(const _Tpvec& a) \ { \ return _Tpvec(wasm_v128_not(a.val)); \ } @@ -1214,17 +1219,17 @@ OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) 
\ { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ -inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_lt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ -inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_gt(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ -inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_le(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ -inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +inline _Tpvec v_ge(const _Tpvec& a, const _Tpvec& b) \ { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) @@ -1237,10 +1242,10 @@ OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ -inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ -inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ -{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } +inline _Tpvec v_eq(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_eq(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } \ +inline _Tpvec v_ne(const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_ne(v_reinterpret_as_f64(a), v_reinterpret_as_f64(b))); } OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) @@ -1298,17 +1303,17 @@ OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) /** Absolute difference **/ inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) -{ return v_add_wrap(a - b, b - a); } +{ return v_add_wrap(v_sub(a, b), v_sub(b, a)); } inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) { v_int8x16 d = v_sub_wrap(a, b); - v_int8x16 m = a < b; - return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); + v_int8x16 m = v_lt(a, b); + return v_reinterpret_as_u8(v_sub_wrap(v_xor(d, m), m)); } inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) { @@ -1316,25 +1321,25 @@ inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) } inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) { - v_int32x4 d = a - b; - v_int32x4 m = a < b; - return v_reinterpret_as_u32((d ^ m) - m); + v_int32x4 d = v_sub(a, b); + v_int32x4 m = v_lt(a, b); + return v_reinterpret_as_u32(v_sub(v_xor(d, m), m)); } /** Saturating absolute difference **/ inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) { - v_int8x16 d = a - b; - v_int8x16 m = a < b; - return (d ^ m) - m; + v_int8x16 d = v_sub(a, b); + v_int8x16 m = v_lt(a, b); + return v_sub(v_xor(d, m), m); } inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b) -{ return v_max(a, b) - v_min(a, b); } +{ return v_sub(v_max(a, b), v_min(a, b)); } inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_int32x4 
v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) @@ -1344,12 +1349,12 @@ inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) { - return a * b + c; + return v_add(v_mul(a, b), c); } inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b) @@ -1385,19 +1390,19 @@ OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4) OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2) #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \ -inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shl(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shl(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \ } \ -inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ +inline _Tpuvec v_shr(const _Tpuvec& a, int imm) \ { \ return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \ } \ -inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ +inline _Tpsvec v_shr(const _Tpsvec& a, int imm) \ { \ return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \ } \ @@ -1466,7 +1471,7 @@ namespace hal_wasm_internal inline v128_t operator()(const v128_t& a, const v128_t& b) const { enum { imm2 = (sizeof(v128_t) - imm) }; - return wasm_v8x16_shuffle(a, b, + return wasm_i8x16_shuffle(a, b, imm, imm+1, imm+2, imm+3, imm+4, imm+5, imm+6, imm+7, imm+8, imm+9, imm+10, imm+11, @@ -1577,19 +1582,19 @@ OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double) /** Reverse **/ inline v_uint8x16 v_reverse(const v_uint8x16 &a) -{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } +{ return v_uint8x16(wasm_i8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } inline v_int8x16 v_reverse(const v_int8x16 &a) { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } inline v_uint16x8 v_reverse(const v_uint16x8 &a) -{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } +{ return v_uint16x8(wasm_i8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } inline v_int16x8 v_reverse(const v_int16x8 &a) { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } inline v_uint32x4 v_reverse(const v_uint32x4 &a) -{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } +{ return v_uint32x4(wasm_i8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } inline v_int32x4 v_reverse(const v_int32x4 &a) { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } @@ -1598,7 +1603,7 @@ inline v_float32x4 v_reverse(const v_float32x4 &a) { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } inline v_uint64x2 v_reverse(const v_uint64x2 &a) -{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } +{ return v_uint64x2(wasm_i8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } inline v_int64x2 v_reverse(const v_int64x2 &a) { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } @@ -1611,8 +1616,8 @@ inline 
v_float64x2 v_reverse(const v_float64x2 &a) inline scalartype v_reduce_sum(const _Tpvec& a) \ { \ regtype val = a.val; \ - val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ - val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \ + val = wasm_##suffix##_add(val, wasm_i8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + val = wasm_##suffix##_add(val, wasm_i8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \ return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ } @@ -1644,7 +1649,7 @@ OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int) inline scalartype v_reduce_sum(const _Tpvec& a) \ { \ regtype val = a.val; \ - val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + val = wasm_##suffix##_add(val, wasm_i8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ } OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2) @@ -1693,7 +1698,7 @@ inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) { @@ -1702,19 +1707,19 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) v_expand(v_absdiff(a, b), l16, h16); v_expand(l16, l16_l32, l16_h32); v_expand(h16, h16_l32, h16_h32); - return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); + return v_reduce_sum(v_add(v_add(l16_l32, l16_h32), v_add(h16_l32, h16_h32))); } inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) { v_uint32x4 l, h; v_expand(v_absdiff(a, b), l, h); - return v_reduce_sum(l + h); + return v_reduce_sum(v_add(l, h)); } inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) { @@ -1743,15 +1748,15 @@ inline v_uint8x16 v_popcount(const v_uint8x16& a) inline v_uint16x8 v_popcount(const v_uint16x8& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); + p = v_add(p, v_rotate_right<1>(p)); + return v_and(v_reinterpret_as_u16(p), v_setall_u16(0x00ff)); } inline v_uint32x4 v_popcount(const v_uint32x4& a) { v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); - p += v_rotate_right<1>(p); - p += v_rotate_right<2>(p); - return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); + p = v_add(p, v_rotate_right<1>(p)); + p = v_add(p, v_rotate_right<2>(p)); + return v_and(v_reinterpret_as_u32(p), v_setall_u32(0x000000ff)); } inline v_uint64x2 v_popcount(const v_uint64x2& a) { @@ -1991,8 +1996,8 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) v128_t t00 = wasm_v128_load(ptr); v128_t t01 = wasm_v128_load(ptr + 16); - a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30); - b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31); + a.val = wasm_i8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30); + b.val = wasm_i8x16_shuffle(t00, t01, 
1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31); } inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) @@ -2001,13 +2006,13 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v128_t t01 = wasm_v128_load(ptr + 16); v128_t t02 = wasm_v128_load(ptr + 32); - v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7); - v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6); - v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7); + v128_t t10 = wasm_i8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7); + v128_t t11 = wasm_i8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6); + v128_t t12 = wasm_i8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7); - a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29); - b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30); - c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31); + a.val = wasm_i8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29); + b.val = wasm_i8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30); + c.val = wasm_i8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31); } inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) @@ -2017,15 +2022,15 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ... v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ... - v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); - v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); - v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); - v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + v128_t v0 = wasm_i8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v1 = wasm_i8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v2 = wasm_i8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + v128_t v3 = wasm_i8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); - a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); - b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); - c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); - d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + a.val = wasm_i8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_i8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_i8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_i8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) @@ -2033,8 +2038,8 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3 v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7 - a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7 - b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 ab b3 b4 b5 b6 b7 + a.val = wasm_i8x16_shuffle(v0, v1, 
0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = wasm_i8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 ab b3 b4 b5 b6 b7 } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) @@ -2043,13 +2048,13 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5 v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7 - v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5); - v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7); - v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7); + v128_t t10 = wasm_i8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5); + v128_t t11 = wasm_i8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7); + v128_t t12 = wasm_i8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7); - a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27); - b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29); - c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31); + a.val = wasm_i8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27); + b.val = wasm_i8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29); + c.val = wasm_i8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31); } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) @@ -2059,15 +2064,15 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ... v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ... 
- v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3 - v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7 - v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3 - v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7 + v128_t v0 = wasm_i8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3 + v128_t v1 = wasm_i8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7 + v128_t v2 = wasm_i8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3 + v128_t v3 = wasm_i8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7 - a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); - b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); - c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); - d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + a.val = wasm_i8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_i8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_i8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_i8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); } inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b) @@ -2075,8 +2080,8 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3 - a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 - b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 + a.val = wasm_i8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_i8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 } inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) @@ -2085,13 +2090,13 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 - v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); - v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); - v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + v128_t t10 = wasm_i8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_i8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_i8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); - a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); - b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); - c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); + a.val = wasm_i8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_i8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_i8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); } inline void 
v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) @@ -2109,8 +2114,8 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3 - a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 - b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 + a.val = wasm_i8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_i8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 } inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) @@ -2119,13 +2124,13 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 - v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); - v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); - v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + v128_t t10 = wasm_i8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_i8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_i8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); - a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); - b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); - c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); + a.val = wasm_i8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_i8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_i8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); } inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) @@ -2153,9 +2158,9 @@ inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1 v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1 - a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); - b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23); - c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); + a.val = wasm_i8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); + b.val = wasm_i8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23); + c.val = wasm_i8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); } inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, @@ -2187,13 +2192,13 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) { - v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5); - v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26); - v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0); + v128_t t00 = wasm_i8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5); + v128_t t01 = wasm_i8x16_shuffle(a.val, b.val, 
21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26); + v128_t t02 = wasm_i8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0); - v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15); - v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15); - v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31); + v128_t t10 = wasm_i8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15); + v128_t t11 = wasm_i8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15); + v128_t t12 = wasm_i8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31); wasm_v128_store(ptr, t10); wasm_v128_store(ptr + 16, t11); @@ -2238,13 +2243,13 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) { - v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21); - v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11); - v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0); + v128_t t00 = wasm_i8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21); + v128_t t01 = wasm_i8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11); + v128_t t02 = wasm_i8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0); - v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15); - v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15); - v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31); + v128_t t10 = wasm_i8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15); + v128_t t11 = wasm_i8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15); + v128_t t12 = wasm_i8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31); wasm_v128_store(ptr, t10); wasm_v128_store(ptr + 8, t11); @@ -2288,13 +2293,13 @@ inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) { - v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); - v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); - v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + v128_t t00 = wasm_i8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_i8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_i8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); - v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); - v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); - v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + v128_t t10 = wasm_i8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_i8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_i8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); wasm_v128_store(ptr, t10); wasm_v128_store(ptr + 4, t11); @@ -2328,13 +2333,13 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 inline void 
v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) { - v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); - v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); - v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + v128_t t00 = wasm_i8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_i8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_i8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); - v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); - v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); - v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + v128_t t10 = wasm_i8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_i8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_i8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); wasm_v128_store(ptr, t10); wasm_v128_store(ptr + 4, t11); @@ -2367,9 +2372,9 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) { - v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); - v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15); - v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + v128_t v0 = wasm_i8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + v128_t v1 = wasm_i8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t v2 = wasm_i8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); wasm_v128_store(ptr, v0); wasm_v128_store(ptr + 2, v1); @@ -2682,45 +2687,45 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) { - return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15)); + return v_int8x16(wasm_i8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15)); } inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } inline v_int8x16 v_interleave_quads(const v_int8x16& vec) { - return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15)); + return v_int8x16(wasm_i8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15)); } inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) { - return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15)); + return v_int16x8(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15)); } inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } inline v_int16x8 v_interleave_quads(const v_int16x8& vec) { - return v_int16x8(wasm_v8x16_shuffle(vec.val, 
vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15)); + return v_int16x8(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15)); } inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) { - return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15)); + return v_int32x4(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15)); } inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { - return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15)); + return v_float32x4(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15)); } inline v_int8x16 v_pack_triplets(const v_int8x16& vec) { - return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16)); + return v_int8x16(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16)); } inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } inline v_int16x8 v_pack_triplets(const v_int16x8& vec) { - return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7)); + return v_int16x8(wasm_i8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7)); } inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } @@ -2753,7 +2758,7 @@ inline v_float32x4 v_broadcast_element(const v_float32x4& a) ////////////// FP16 support /////////////////////////// -inline v_float32x4 v_load_expand(const float16_t* ptr) +inline v_float32x4 v_load_expand(const hfloat* ptr) { float a[4]; for (int i = 0; i < 4; i++) @@ -2761,18 +2766,32 @@ inline v_float32x4 v_load_expand(const float16_t* ptr) return v_float32x4(wasm_v128_load(a)); } -inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +inline void v_pack_store(hfloat* ptr, const v_float32x4& v) { double v_[4]; wasm_v128_store(v_, v.val); - ptr[0] = float16_t(v_[0]); - ptr[1] = float16_t(v_[1]); - ptr[2] = float16_t(v_[2]); - ptr[3] = float16_t(v_[3]); + ptr[0] = hfloat(v_[0]); + ptr[1] = hfloat(v_[1]); + ptr[2] = hfloat(v_[2]); + ptr[3] = hfloat(v_[3]); } inline void v_cleanup() {} +#include "intrin_math.hpp" +inline v_float32x4 v_exp(const v_float32x4& x) { return v_exp_default_32f(x); } +inline v_float32x4 v_log(const v_float32x4& x) { return v_log_default_32f(x); } +inline void v_sincos(const v_float32x4& x, v_float32x4& s, v_float32x4& c) { v_sincos_default_32f(x, s, c); } +inline v_float32x4 v_sin(const v_float32x4& x) { return v_sin_default_32f(x); } +inline v_float32x4 v_cos(const v_float32x4& x) { return v_cos_default_32f(x); } +inline v_float32x4 v_erf(const v_float32x4& x) { return v_erf_default_32f(x); } + +inline v_float64x2 v_exp(const v_float64x2& x) { return v_exp_default_64f(x); } +inline v_float64x2 v_log(const v_float64x2& x) { return v_log_default_64f(x); } +inline void v_sincos(const v_float64x2& x, v_float64x2& s, v_float64x2& c) { v_sincos_default_64f(x, s, c); } +inline v_float64x2 v_sin(const v_float64x2& x) { return v_sin_default_64f(x); } +inline v_float64x2 v_cos(const v_float64x2& x) { return v_cos_default_64f(x); } + 
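The block above routes the WASM backend's transcendental intrinsics (v_exp, v_log, v_sin, v_cos, v_erf, v_sincos) to the shared software implementations in intrin_math.hpp. A minimal usage sketch, assuming OpenCV's universal-intrinsics header and a 128-bit SIMD backend such as this one; the helper name exp_inplace is illustrative only:

#include <cmath>
#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: exponentiate a float buffer, using v_exp on full
// 4-lane vectors and std::exp for the scalar tail.
static void exp_inplace(float* data, int n)
{
    int i = 0;
#if CV_SIMD128
    for (; i + 4 <= n; i += 4)                    // v_float32x4 holds 4 float lanes
    {
        cv::v_float32x4 v = cv::v_load(data + i);
        cv::v_store(data + i, cv::v_exp(v));      // v_exp_default_32f on this backend
    }
#endif
    for (; i < n; ++i)
        data[i] = std::exp(data[i]);
}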
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END //! @endcond diff --git a/3rdParty/opencv2/core/hal/simd_utils.impl.hpp b/3rdParty/opencv2/core/hal/simd_utils.impl.hpp index 12b0b8f6b4..14e3dfd56f 100644 --- a/3rdParty/opencv2/core/hal/simd_utils.impl.hpp +++ b/3rdParty/opencv2/core/hal/simd_utils.impl.hpp @@ -128,8 +128,48 @@ template<> inline Type2Vec512_Traits::vec_type v512_setall(const #endif // SIMD512 +#if CV_SIMD_SCALABLE +template struct Type2Vec_Traits; +#define CV_INTRIN_DEF_TYPE2VEC_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC_TRAITS(uchar, v_uint8); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(schar, v_int8); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(ushort, v_uint16); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(short, v_int16); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(unsigned, v_uint32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(int, v_int32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(float, v_float32); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(uint64, v_uint64); +CV_INTRIN_DEF_TYPE2VEC_TRAITS(int64, v_int64); +#if CV_SIMD_SCALABLE_64F +CV_INTRIN_DEF_TYPE2VEC_TRAITS(double, v_float64); +#endif +template static inline +typename Type2Vec_Traits<_T>::vec_type v_setall(const _T& a); + +template<> inline Type2Vec_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); } +template<> inline Type2Vec_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); } +template<> inline Type2Vec_Traits::vec_type v_setall(const ushort& a) { return v_setall_u16(a); } +template<> inline Type2Vec_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); } +template<> inline Type2Vec_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); } +template<> inline Type2Vec_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); } +template<> inline Type2Vec_Traits::vec_type v_setall(const uint64& a) { return v_setall_u64(a); } +template<> inline Type2Vec_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); } +template<> inline Type2Vec_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); } +#if CV_SIMD_SCALABLE_64F +template<> inline Type2Vec_Traits::vec_type v_setall(const double& a) { return v_setall_f64(a); } +#endif +#endif -#if CV_SIMD_WIDTH == 16 + +#if CV_SIMD_SCALABLE +template static inline +typename Type2Vec_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } +#elif CV_SIMD_WIDTH == 16 template static inline typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } #elif CV_SIMD_WIDTH == 32 diff --git a/3rdParty/opencv2/core/mat.hpp b/3rdParty/opencv2/core/mat.hpp index 6cc8c79a83..be1b8faecb 100644 --- a/3rdParty/opencv2/core/mat.hpp +++ b/3rdParty/opencv2/core/mat.hpp @@ -53,6 +53,7 @@ #include "opencv2/core/bufferpool.hpp" +#include #include namespace cv @@ -78,9 +79,10 @@ It is defined as: @code typedef const _InputArray& InputArray; @endcode -where _InputArray is a class that can be constructed from `Mat`, `Mat_`, `Matx`, -`std::vector`, `std::vector >`, `std::vector`, `std::vector >`, -`UMat`, `std::vector` or `double`. It can also be constructed from a matrix expression. +where \ref cv::_InputArray is a class that can be constructed from \ref cv::Mat, \ref cv::Mat_, +\ref cv::Matx, std::vector, std::vector>, std::vector, +std::vector>, \ref cv::UMat, std::vector or `double`. It can also be constructed from +a matrix expression. 
Since this is mostly implementation-level class, and its interface may change in future versions, we do not describe it in details. There are a few key things, though, that should be kept in mind: @@ -445,6 +447,22 @@ typedef OutputArray OutputArrayOfArrays; typedef const _InputOutputArray& InputOutputArray; typedef InputOutputArray InputOutputArrayOfArrays; +/** @brief Returns an empty InputArray or OutputArray. + + This function is used to provide an "empty" or "null" array when certain functions + take optional input or output arrays that you don't want to provide. + + Many OpenCV functions accept optional arguments as `cv::InputArray` or `cv::OutputArray`. + When you don't want to pass any data for these optional parameters, you can use `cv::noArray()` + to indicate that you are omitting them. + + @return An empty `cv::InputArray` or `cv::OutputArray` that can be used as a placeholder. + + @note This is often used when a function has optional arrays, and you do not want to + provide a specific input or output array. + + @see cv::InputArray, cv::OutputArray + */ CV_EXPORTS InputOutputArray noArray(); /////////////////////////////////// MatAllocator ////////////////////////////////////// @@ -1289,15 +1307,36 @@ class CV_EXPORTS Mat t(); // finally, transpose the Nx3 matrix. // This involves copying all the elements @endcode + 3-channel 2x2 matrix reshaped to 1-channel 4x3 matrix, each column has values from one of original channels: + @code + Mat m(Size(2, 2), CV_8UC3, Scalar(1, 2, 3)); + vector<int> new_shape {4, 3}; + m = m.reshape(1, new_shape); + @endcode + or: + @code + Mat m(Size(2, 2), CV_8UC3, Scalar(1, 2, 3)); + const int new_shape[] = {4, 3}; + m = m.reshape(1, 2, new_shape); + @endcode @param cn New number of channels. If the parameter is 0, the number of channels remains the same. @param rows New number of rows. If the parameter is 0, the number of rows remains the same. */ Mat reshape(int cn, int rows=0) const; - /** @overload */ + /** @overload + * @param cn New number of channels. If the parameter is 0, the number of channels remains the same. + * @param newndims New number of dimensions. + * @param newsz Array with new matrix size by all dimensions. If some sizes are zero, + * the original sizes in those dimensions are presumed. + */ Mat reshape(int cn, int newndims, const int* newsz) const; - /** @overload */ + /** @overload + * @param cn New number of channels. If the parameter is 0, the number of channels remains the same. + * @param newshape Vector with new matrix size by all dimensions. If some sizes are zero, + * the original sizes in those dimensions are presumed. + */ Mat reshape(int cn, const std::vector<int>& newshape) const; /** @brief Transposes a matrix.
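The new noArray() documentation above is easiest to read next to a call site. A short sketch, assuming opencv2/core.hpp (the helper name add_as_float is illustrative): cv::add() declares an optional mask defaulting to noArray(), so passing cv::noArray() explicitly lets a caller skip the mask while still setting the trailing dtype argument.

#include <opencv2/core.hpp>

// cv::add(InputArray src1, InputArray src2, OutputArray dst,
//         InputArray mask = noArray(), int dtype = -1);
void add_as_float(const cv::Mat& a, const cv::Mat& b, cv::Mat& dst)
{
    cv::add(a, b, dst, cv::noArray(), CV_32F);   // no mask, force CV_32F output
}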
@@ -2097,7 +2136,7 @@ class CV_EXPORTS Mat /** @overload */ template void forEach(const Functor& operation) const; - Mat(Mat&& m); + Mat(Mat&& m) CV_NOEXCEPT; Mat& operator = (Mat&& m); enum { MAGIC_VAL = 0x42FF0000, AUTO_STEP = 0, CONTINUOUS_FLAG = CV_MAT_CONT_FLAG, SUBMATRIX_FLAG = CV_SUBMAT_FLAG }; diff --git a/3rdParty/opencv2/core/mat.inl.hpp b/3rdParty/opencv2/core/mat.inl.hpp index 940cd406b0..0d9c557bce 100644 --- a/3rdParty/opencv2/core/mat.inl.hpp +++ b/3rdParty/opencv2/core/mat.inl.hpp @@ -51,7 +51,7 @@ #ifdef _MSC_VER #pragma warning( push ) -#pragma warning( disable: 4127 ) +#pragma warning( disable: 4127 5054 ) #endif #if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS) @@ -100,10 +100,10 @@ inline Size _InputArray::getSz() const { return sz; } inline _InputArray::_InputArray() { init(0 + NONE, 0); } inline _InputArray::_InputArray(int _flags, void* _obj) { init(_flags, _obj); } -inline _InputArray::_InputArray(const Mat& m) { init(MAT+ACCESS_READ, &m); } -inline _InputArray::_InputArray(const std::vector& vec) { init(STD_VECTOR_MAT+ACCESS_READ, &vec); } -inline _InputArray::_InputArray(const UMat& m) { init(UMAT+ACCESS_READ, &m); } -inline _InputArray::_InputArray(const std::vector& vec) { init(STD_VECTOR_UMAT+ACCESS_READ, &vec); } +inline _InputArray::_InputArray(const Mat& m) { init(+MAT+ACCESS_READ, &m); } +inline _InputArray::_InputArray(const std::vector& vec) { init(+STD_VECTOR_MAT+ACCESS_READ, &vec); } +inline _InputArray::_InputArray(const UMat& m) { init(+UMAT+ACCESS_READ, &m); } +inline _InputArray::_InputArray(const std::vector& vec) { init(+STD_VECTOR_UMAT+ACCESS_READ, &vec); } template inline _InputArray::_InputArray(const std::vector<_Tp>& vec) @@ -115,7 +115,7 @@ _InputArray::_InputArray(const std::array<_Tp, _Nm>& arr) template inline _InputArray::_InputArray(const std::array& arr) -{ init(STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); } +{ init(+STD_ARRAY_MAT + ACCESS_READ, arr.data(), Size(1, _Nm)); } inline _InputArray::_InputArray(const std::vector& vec) @@ -145,16 +145,16 @@ inline _InputArray::_InputArray(const double& val) { init(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F + ACCESS_READ, &val, Size(1,1)); } inline _InputArray::_InputArray(const cuda::GpuMat& d_mat) -{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); } +{ init(+CUDA_GPU_MAT + ACCESS_READ, &d_mat); } inline _InputArray::_InputArray(const std::vector& d_mat) -{ init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);} +{ init(+STD_VECTOR_CUDA_GPU_MAT + ACCESS_READ, &d_mat);} inline _InputArray::_InputArray(const ogl::Buffer& buf) -{ init(OPENGL_BUFFER + ACCESS_READ, &buf); } +{ init(+OPENGL_BUFFER + ACCESS_READ, &buf); } inline _InputArray::_InputArray(const cuda::HostMem& cuda_mem) -{ init(CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); } +{ init(+CUDA_HOST_MEM + ACCESS_READ, &cuda_mem); } template inline _InputArray _InputArray::rawIn(const std::vector<_Tp>& vec) @@ -197,12 +197,12 @@ inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray:: //////////////////////////////////////////////////////////////////////////////////////// -inline _OutputArray::_OutputArray() { init(NONE + ACCESS_WRITE, 0); } +inline _OutputArray::_OutputArray() { init(+NONE + ACCESS_WRITE, 0); } inline _OutputArray::_OutputArray(int _flags, void* _obj) { init(_flags + ACCESS_WRITE, _obj); } -inline _OutputArray::_OutputArray(Mat& m) { init(MAT+ACCESS_WRITE, &m); } -inline _OutputArray::_OutputArray(std::vector& vec) { init(STD_VECTOR_MAT + ACCESS_WRITE, &vec); } -inline _OutputArray::_OutputArray(UMat& m) { 
init(UMAT + ACCESS_WRITE, &m); } -inline _OutputArray::_OutputArray(std::vector& vec) { init(STD_VECTOR_UMAT + ACCESS_WRITE, &vec); } +inline _OutputArray::_OutputArray(Mat& m) { init(+MAT+ACCESS_WRITE, &m); } +inline _OutputArray::_OutputArray(std::vector& vec) { init(+STD_VECTOR_MAT + ACCESS_WRITE, &vec); } +inline _OutputArray::_OutputArray(UMat& m) { init(+UMAT + ACCESS_WRITE, &m); } +inline _OutputArray::_OutputArray(std::vector& vec) { init(+STD_VECTOR_UMAT + ACCESS_WRITE, &vec); } template inline _OutputArray::_OutputArray(std::vector<_Tp>& vec) @@ -214,7 +214,7 @@ _OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr) template inline _OutputArray::_OutputArray(std::array& arr) -{ init(STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } +{ init(+STD_ARRAY_MAT + ACCESS_WRITE, arr.data(), Size(1, _Nm)); } template inline _OutputArray::_OutputArray(std::vector >& vec) @@ -269,16 +269,16 @@ _OutputArray::_OutputArray(const _Tp* vec, int n) { init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, vec, Size(n, 1)); } inline _OutputArray::_OutputArray(cuda::GpuMat& d_mat) -{ init(CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); } +{ init(+CUDA_GPU_MAT + ACCESS_WRITE, &d_mat); } inline _OutputArray::_OutputArray(std::vector& d_mat) -{ init(STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);} +{ init(+STD_VECTOR_CUDA_GPU_MAT + ACCESS_WRITE, &d_mat);} inline _OutputArray::_OutputArray(ogl::Buffer& buf) -{ init(OPENGL_BUFFER + ACCESS_WRITE, &buf); } +{ init(+OPENGL_BUFFER + ACCESS_WRITE, &buf); } inline _OutputArray::_OutputArray(cuda::HostMem& cuda_mem) -{ init(CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); } +{ init(+CUDA_HOST_MEM + ACCESS_WRITE, &cuda_mem); } inline _OutputArray::_OutputArray(const Mat& m) { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_WRITE, &m); } @@ -325,10 +325,10 @@ _OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr) inline _InputOutputArray::_InputOutputArray() { init(0+ACCESS_RW, 0); } inline _InputOutputArray::_InputOutputArray(int _flags, void* _obj) { init(_flags+ACCESS_RW, _obj); } -inline _InputOutputArray::_InputOutputArray(Mat& m) { init(MAT+ACCESS_RW, &m); } -inline _InputOutputArray::_InputOutputArray(std::vector& vec) { init(STD_VECTOR_MAT+ACCESS_RW, &vec); } -inline _InputOutputArray::_InputOutputArray(UMat& m) { init(UMAT+ACCESS_RW, &m); } -inline _InputOutputArray::_InputOutputArray(std::vector& vec) { init(STD_VECTOR_UMAT+ACCESS_RW, &vec); } +inline _InputOutputArray::_InputOutputArray(Mat& m) { init(+MAT+ACCESS_RW, &m); } +inline _InputOutputArray::_InputOutputArray(std::vector& vec) { init(+STD_VECTOR_MAT+ACCESS_RW, &vec); } +inline _InputOutputArray::_InputOutputArray(UMat& m) { init(+UMAT+ACCESS_RW, &m); } +inline _InputOutputArray::_InputOutputArray(std::vector& vec) { init(+STD_VECTOR_UMAT+ACCESS_RW, &vec); } template inline _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec) @@ -340,7 +340,7 @@ _InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr) template inline _InputOutputArray::_InputOutputArray(std::array& arr) -{ init(STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); } +{ init(+STD_ARRAY_MAT + ACCESS_RW, arr.data(), Size(1, _Nm)); } template inline _InputOutputArray::_InputOutputArray(std::vector >& vec) @@ -395,13 +395,13 @@ _InputOutputArray::_InputOutputArray(const _Tp* vec, int n) { init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, vec, Size(n, 1)); } inline _InputOutputArray::_InputOutputArray(cuda::GpuMat& d_mat) -{ init(CUDA_GPU_MAT + ACCESS_RW, &d_mat); } +{ 
init(+CUDA_GPU_MAT + ACCESS_RW, &d_mat); } inline _InputOutputArray::_InputOutputArray(ogl::Buffer& buf) -{ init(OPENGL_BUFFER + ACCESS_RW, &buf); } +{ init(+OPENGL_BUFFER + ACCESS_RW, &buf); } inline _InputOutputArray::_InputOutputArray(cuda::HostMem& cuda_mem) -{ init(CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); } +{ init(+CUDA_HOST_MEM + ACCESS_RW, &cuda_mem); } inline _InputOutputArray::_InputOutputArray(const Mat& m) { init(FIXED_TYPE + FIXED_SIZE + MAT + ACCESS_RW, &m); } @@ -460,7 +460,7 @@ CV__DEBUG_NS_END template inline Mat::Mat(const std::vector<_Tp>& vec, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()), cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if(vec.empty()) @@ -497,7 +497,7 @@ Mat::Mat(const std::initializer_list sizes, const std::initializer_list<_Tp template inline Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)arr.size()), cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if(arr.empty()) @@ -514,7 +514,7 @@ Mat::Mat(const std::array<_Tp, _Nm>& arr, bool copyData) template inline Mat::Mat(const Vec<_Tp, n>& vec, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(n), cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if( !copyData ) @@ -530,7 +530,7 @@ Mat::Mat(const Vec<_Tp, n>& vec, bool copyData) template inline Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(m), cols(n), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if( !copyData ) @@ -546,7 +546,7 @@ Mat::Mat(const Matx<_Tp,m,n>& M, bool copyData) template inline Mat::Mat(const Point_<_Tp>& pt, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(2), cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if( !copyData ) @@ -565,7 +565,7 @@ Mat::Mat(const Point_<_Tp>& pt, bool copyData) template inline Mat::Mat(const Point3_<_Tp>& pt, bool copyData) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows(3), cols(1), data(0), datastart(0), dataend(0), datalimit(0), allocator(0), u(0), size(&rows), step(0) { if( !copyData ) @@ -585,7 +585,7 @@ Mat::Mat(const Point3_<_Tp>& pt, bool copyData) template inline Mat::Mat(const MatCommaInitializer_<_Tp>& commaInitializer) - : flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0), + : flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0), allocator(0), u(0), 
size(&rows) { *this = commaInitializer.operator Mat_<_Tp>(); @@ -2090,7 +2090,7 @@ SparseMatConstIterator_<_Tp> SparseMat::end() const template inline SparseMat_<_Tp>::SparseMat_() { - flags = MAGIC_VAL + traits::Type<_Tp>::value; + flags = +MAGIC_VAL + traits::Type<_Tp>::value; } template inline @@ -3248,7 +3248,7 @@ const Mat_<_Tp>& operator /= (const Mat_<_Tp>& a, const MatExpr& b) template inline UMat::UMat(const std::vector<_Tp>& vec, bool copyData) -: flags(MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()), +: flags(+MAGIC_VAL + traits::Type<_Tp>::value + CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()), cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows) { if(vec.empty()) diff --git a/3rdParty/opencv2/core/matx.hpp b/3rdParty/opencv2/core/matx.hpp index 5b3bef6938..6216035aa9 100644 --- a/3rdParty/opencv2/core/matx.hpp +++ b/3rdParty/opencv2/core/matx.hpp @@ -61,8 +61,6 @@ namespace cv //! @addtogroup core_basic //! @{ -////////////////////////////// Small Matrix /////////////////////////// - //! @cond IGNORED // FIXIT Remove this (especially CV_EXPORTS modifier) struct CV_EXPORTS Matx_AddOp { Matx_AddOp() {} Matx_AddOp(const Matx_AddOp&) {} }; @@ -74,6 +72,8 @@ struct CV_EXPORTS Matx_MatMulOp { Matx_MatMulOp() {} Matx_MatMulOp(const Matx_Ma struct CV_EXPORTS Matx_TOp { Matx_TOp() {} Matx_TOp(const Matx_TOp&) {} }; //! @endcond +////////////////////////////// Small Matrix /////////////////////////// + /** @brief Template class for small matrices whose type and size are known at compilation time If you need a more flexible type, use Mat . The elements of the matrix M are accessible using the @@ -215,7 +215,7 @@ template class Matx template Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp); Matx(const Matx<_Tp, n, m>& a, Matx_TOp); - _Tp val[m*n]; //< matrix elements + _Tp val[m*n]; ///< matrix elements }; typedef Matx Matx12f; @@ -256,56 +256,83 @@ typedef Matx Matx44d; typedef Matx Matx66f; typedef Matx Matx66d; -/*! 
- traits -*/ -template class DataType< Matx<_Tp, m, n> > -{ -public: - typedef Matx<_Tp, m, n> value_type; - typedef Matx::work_type, m, n> work_type; - typedef _Tp channel_type; - typedef value_type vec_type; - - enum { generic_type = 0, - channels = m * n, - fmt = traits::SafeFmt::fmt + ((channels - 1) << 8) -#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED - ,depth = DataType::depth - ,type = CV_MAKETYPE(depth, channels) -#endif - }; -}; +template static inline +double determinant(const Matx<_Tp, m, m>& a); -namespace traits { -template -struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; }; -template -struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; }; -} // namespace +template static inline +double trace(const Matx<_Tp, m, n>& a); +template static inline +double norm(const Matx<_Tp, m, n>& M); -/** @brief Comma-separated Matrix Initializer -*/ -template class MatxCommaInitializer -{ -public: - MatxCommaInitializer(Matx<_Tp, m, n>* _mtx); - template MatxCommaInitializer<_Tp, m, n>& operator , (T2 val); - Matx<_Tp, m, n> operator *() const; +template static inline +double norm(const Matx<_Tp, m, n>& M, int normType); - Matx<_Tp, m, n>* dst; - int idx; -}; +template static inline +Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b); -/* - Utility methods -*/ -template static double determinant(const Matx<_Tp, m, m>& a); -template static double trace(const Matx<_Tp, m, n>& a); -template static double norm(const Matx<_Tp, m, n>& M); -template static double norm(const Matx<_Tp, m, n>& M, int normType); +template static inline +Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b); + +template static inline +Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); + +template static inline +Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha); + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha); + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha); + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha); + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha); + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha); + +template static inline +Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a); + +template static inline +Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a); + +template static inline +Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a); + +template static inline +Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha); + +template static inline +Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha); + +template static inline +Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha); +template static inline +Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha); + +template static inline +Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a); + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b); + +template static inline +Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b); + +template static inline +bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); + +template static inline +bool 
operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b); /////////////////////// Vec (used as element of multi-channel images ///////////////////// @@ -376,10 +403,8 @@ template class Vec : public Matx<_Tp, cn, 1> static Vec randn(_Tp a, _Tp b); static Vec randu(_Tp a, _Tp b); static Vec zeros(); -#ifdef CV_CXX11 static Vec diag(_Tp alpha) = delete; static Vec eye() = delete; -#endif //! per-element multiplication Vec mul(const Vec<_Tp, cn>& v) const; @@ -402,9 +427,7 @@ template class Vec : public Matx<_Tp, cn, 1> const _Tp& operator ()(int i) const; _Tp& operator ()(int i); -#ifdef CV_CXX11 Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default; -#endif Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp); Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp); @@ -443,1086 +466,79 @@ typedef Vec Vec4d; typedef Vec Vec6d; /** @} */ -/*! - traits -*/ -template class DataType< Vec<_Tp, cn> > -{ -public: - typedef Vec<_Tp, cn> value_type; - typedef Vec::work_type, cn> work_type; - typedef _Tp channel_type; - typedef value_type vec_type; - - enum { generic_type = 0, - channels = cn, - fmt = DataType::fmt + ((channels - 1) << 8), -#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED - depth = DataType::depth, - type = CV_MAKETYPE(depth, channels), -#endif - _dummy_enum_finalizer = 0 - }; -}; - -namespace traits { -template -struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; }; -template -struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; }; -} // namespace - - -/** @brief Comma-separated Vec Initializer -*/ -template class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1> -{ -public: - VecCommaInitializer(Vec<_Tp, m>* _vec); - template VecCommaInitializer<_Tp, m>& operator , (T2 val); - Vec<_Tp, m> operator *() const; -}; - -template static Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v); - -//! @} core_basic - -//! 
@cond IGNORED - -///////////////////////////////////// helper classes ///////////////////////////////////// -namespace internal -{ - -template struct Matx_DetOp -{ - double operator ()(const Matx<_Tp, m, m>& a) const - { - Matx<_Tp, m, m> temp = a; - double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); - if( p == 0 ) - return p; - for( int i = 0; i < m; i++ ) - p *= temp(i, i); - return p; - } -}; - -template struct Matx_DetOp<_Tp, 1> -{ - double operator ()(const Matx<_Tp, 1, 1>& a) const - { - return a(0,0); - } -}; - -template struct Matx_DetOp<_Tp, 2> -{ - double operator ()(const Matx<_Tp, 2, 2>& a) const - { - return a(0,0)*a(1,1) - a(0,1)*a(1,0); - } -}; - -template struct Matx_DetOp<_Tp, 3> -{ - double operator ()(const Matx<_Tp, 3, 3>& a) const - { - return a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2)) - - a(0,1)*(a(1,0)*a(2,2) - a(2,0)*a(1,2)) + - a(0,2)*(a(1,0)*a(2,1) - a(2,0)*a(1,1)); - } -}; - -template Vec<_Tp, 2> inline conjugate(const Vec<_Tp, 2>& v) -{ - return Vec<_Tp, 2>(v[0], -v[1]); -} - -template Vec<_Tp, 4> inline conjugate(const Vec<_Tp, 4>& v) -{ - return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]); -} - -} // internal - - - -////////////////////////////////// Matx Implementation /////////////////////////////////// - -template inline -Matx<_Tp, m, n>::Matx() -{ - for(int i = 0; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0) -{ - val[0] = v0; - for(int i = 1; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1) -{ - CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements."); - val[0] = v0; val[1] = v1; - for(int i = 2; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2) -{ - CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; - for(int i = 3; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3) -{ - CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - for(int i = 4; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4) -{ - CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4; - for(int i = 5; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5) -{ - CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; - for(int i = 6; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6) -{ - CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; - for(int i = 7; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7) -{ - CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - for(int i = 8; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp 
v6, _Tp v7, _Tp v8) -{ - CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - val[8] = v8; - for(int i = 9; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9) -{ - CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - val[8] = v8; val[9] = v9; - for(int i = 10; i < channels; i++) val[i] = _Tp(0); -} - - -template inline -Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11) -{ - CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; - for(int i = 12; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13) -{ - CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; - val[12] = v12; val[13] = v13; - for (int i = 14; i < channels; i++) val[i] = _Tp(0); -} - - -template inline -Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15) -{ - CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements."); - val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; - val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; - val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; - val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15; - for(int i = 16; i < channels; i++) val[i] = _Tp(0); -} - -template inline -Matx<_Tp, m, n>::Matx(const _Tp* values) -{ - for( int i = 0; i < channels; i++ ) val[i] = values[i]; -} - -template inline -Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list) -{ - CV_DbgAssert(list.size() == channels); - int i = 0; - for(const auto& elem : list) - { - val[i++] = elem; - } -} - -template inline -Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha) -{ - Matx<_Tp, m, n> M; - for( int i = 0; i < m*n; i++ ) M.val[i] = alpha; - return M; -} - -template inline -Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros() -{ - return all(0); -} - -template inline -Matx<_Tp,m,n> Matx<_Tp,m,n>::ones() -{ - return all(1); -} - -template inline -Matx<_Tp,m,n> Matx<_Tp,m,n>::eye() -{ - Matx<_Tp,m,n> M; - for(int i = 0; i < shortdim; i++) - M(i,i) = 1; - return M; -} - -template inline -_Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const -{ - _Tp s = 0; - for( int i = 0; i < channels; i++ ) s += val[i]*M.val[i]; - return s; -} - -template inline -double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const -{ - double s = 0; - for( int i = 0; i < channels; i++ ) s += (double)val[i]*M.val[i]; - return s; -} - -template inline -Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d) -{ - Matx<_Tp,m,n> M; - for(int i = 0; i < shortdim; i++) - M(i,i) = d(i, 0); - return M; -} - -template template -inline Matx<_Tp, m, n>::operator Matx() const -{ - Matx M; - for( 
int i = 0; i < m*n; i++ ) M.val[i] = saturate_cast(val[i]); - return M; -} - -template template inline -Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const -{ - CV_StaticAssert(m1*n1 == m*n, "Input and destnarion matrices must have the same number of elements"); - return (const Matx<_Tp, m1, n1>&)*this; -} - -template -template inline -Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const -{ - CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n); - Matx<_Tp, m1, n1> s; - for( int di = 0; di < m1; di++ ) - for( int dj = 0; dj < n1; dj++ ) - s(di, dj) = (*this)(base_row+di, base_col+dj); - return s; -} - -template inline -Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const -{ - CV_DbgAssert((unsigned)i < (unsigned)m); - return Matx<_Tp, 1, n>(&val[i*n]); -} - -template inline -Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const -{ - CV_DbgAssert((unsigned)j < (unsigned)n); - Matx<_Tp, m, 1> v; - for( int i = 0; i < m; i++ ) - v.val[i] = val[i*n + j]; - return v; -} - -template inline -typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const -{ - diag_type d; - for( int i = 0; i < shortdim; i++ ) - d.val[i] = val[i*n + i]; - return d; -} - -template inline -const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const -{ - CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n ); - return this->val[row_idx*n + col_idx]; -} - -template inline -_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx) -{ - CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n ); - return val[row_idx*n + col_idx]; -} - -template inline -const _Tp& Matx<_Tp, m, n>::operator ()(int i) const -{ - CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row"); - CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) ); - return val[i]; -} - -template inline -_Tp& Matx<_Tp, m, n>::operator ()(int i) -{ - CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row"); - CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) ); - return val[i]; -} - -template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp) -{ - for( int i = 0; i < channels; i++ ) - val[i] = saturate_cast<_Tp>(a.val[i] + b.val[i]); -} - -template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp) -{ - for( int i = 0; i < channels; i++ ) - val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]); -} - -template template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp) -{ - for( int i = 0; i < channels; i++ ) - val[i] = saturate_cast<_Tp>(a.val[i] * alpha); -} - -template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp) -{ - for( int i = 0; i < channels; i++ ) - val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]); -} - -template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp) -{ - for( int i = 0; i < channels; i++ ) - val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]); -} - -template template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp) -{ - for( int i = 0; i < m; i++ ) - for( int j = 0; j < n; j++ ) - { - _Tp s = 0; - for( int k = 0; k < l; k++ ) - s += a(i, k) * b(k, j); - val[i*n + j] = s; - } -} - -template inline -Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp) -{ - for( int i = 0; i < m; i++ ) - for( int j = 0; j < n; j++ ) - 
val[i*n + j] = a(j, i); -} - -template inline -Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const -{ - return Matx<_Tp, m, n>(*this, a, Matx_MulOp()); -} - -template inline -Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const -{ - return Matx<_Tp, m, n>(*this, a, Matx_DivOp()); -} - -template inline -Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const -{ - return Matx<_Tp, n, m>(*this, Matx_TOp()); -} - -template inline -Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const -{ - Matx<_Tp, n, 1> x = solve((const Matx<_Tp, m, 1>&)(rhs), method); - return (Vec<_Tp, n>&)(x); -} - -template static inline -double determinant(const Matx<_Tp, m, m>& a) -{ - return cv::internal::Matx_DetOp<_Tp, m>()(a); -} - -template static inline -double trace(const Matx<_Tp, m, n>& a) -{ - _Tp s = 0; - for( int i = 0; i < std::min(m, n); i++ ) - s += a(i,i); - return s; -} - -template static inline -double norm(const Matx<_Tp, m, n>& M) -{ - return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n)); -} - -template static inline -double norm(const Matx<_Tp, m, n>& M, int normType) -{ - switch(normType) { - case NORM_INF: - return (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); - case NORM_L1: - return (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); - case NORM_L2SQR: - return (double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); - default: - case NORM_L2: - return std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n)); - } -} - - - -//////////////////////////////// matx comma initializer ////////////////////////////////// - -template static inline -MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val) -{ - MatxCommaInitializer<_Tp, m, n> commaInitializer((Matx<_Tp, m, n>*)&mtx); - return (commaInitializer, val); -} - -template inline -MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx) - : dst(_mtx), idx(0) -{} - -template template inline -MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value) -{ - CV_DbgAssert( idx < m*n ); - dst->val[idx++] = saturate_cast<_Tp>(value); - return *this; -} - -template inline -Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const -{ - CV_DbgAssert( idx == n*m ); - return *dst; -} - - - -/////////////////////////////////// Vec Implementation /////////////////////////////////// - -template inline -Vec<_Tp, cn>::Vec() {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0) - : Matx<_Tp, cn, 1>(v0) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1) - : Matx<_Tp, cn, 1>(v0, v1) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2) - : Matx<_Tp, cn, 1>(v0, v1, v2) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8) - : Matx<_Tp, cn, 
1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {} - -template inline -Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13) - : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {} - -template inline -Vec<_Tp, cn>::Vec(const _Tp* values) - : Matx<_Tp, cn, 1>(values) {} - -template inline -Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list) - : Matx<_Tp, cn, 1>(list) {} - -template inline -Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m) - : Matx<_Tp, cn, 1>(m.val) {} - -template inline -Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op) - : Matx<_Tp, cn, 1>(a, b, op) {} - -template inline -Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op) - : Matx<_Tp, cn, 1>(a, b, op) {} - -template template inline -Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op) - : Matx<_Tp, cn, 1>(a, alpha, op) {} - -template inline -Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha) -{ - Vec v; - for( int i = 0; i < cn; i++ ) v.val[i] = alpha; - return v; -} - -template inline -Vec<_Tp, cn> Vec<_Tp, cn>::ones() -{ - return Vec::all(1); -} - template inline -Vec<_Tp, cn> Vec<_Tp, cn>::zeros() -{ - return Vec::all(0); -} - -template inline -Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const -{ - Vec<_Tp, cn> w; - for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]); - return w; -} - -template<> inline -Vec Vec::conj() const -{ - return cv::internal::conjugate(*this); -} - -template<> inline -Vec Vec::conj() const -{ - return cv::internal::conjugate(*this); -} - -template<> inline -Vec Vec::conj() const -{ - return cv::internal::conjugate(*this); -} - -template<> inline -Vec Vec::conj() const -{ - return cv::internal::conjugate(*this); -} - -template inline -Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>&) const -{ - CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined"); - return Vec<_Tp, cn>(); -} - -template<> inline -Vec Vec::cross(const Vec& v) const -{ - return Vec(this->val[1]*v.val[2] - this->val[2]*v.val[1], - this->val[2]*v.val[0] - this->val[0]*v.val[2], - this->val[0]*v.val[1] - this->val[1]*v.val[0]); -} - -template<> inline -Vec Vec::cross(const Vec& v) const -{ - return Vec(this->val[1]*v.val[2] - this->val[2]*v.val[1], - this->val[2]*v.val[0] - this->val[0]*v.val[2], - this->val[0]*v.val[1] - this->val[1]*v.val[0]); -} - -template template inline -Vec<_Tp, cn>::operator Vec() const -{ - Vec v; - for( int i = 0; i < cn; i++ ) v.val[i] = saturate_cast(this->val[i]); - return v; -} - -template inline -const _Tp& Vec<_Tp, cn>::operator [](int i) const -{ - CV_DbgAssert( (unsigned)i < (unsigned)cn ); - return this->val[i]; -} - -template inline -_Tp& Vec<_Tp, cn>::operator [](int i) -{ - CV_DbgAssert( (unsigned)i < (unsigned)cn ); - return this->val[i]; -} - -template inline -const _Tp& Vec<_Tp, cn>::operator ()(int i) const -{ - CV_DbgAssert( (unsigned)i < (unsigned)cn ); - return this->val[i]; -} - -template inline -_Tp& Vec<_Tp, cn>::operator ()(int i) -{ - CV_DbgAssert( (unsigned)i < (unsigned)cn ); - return this->val[i]; -} - -template inline -Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v) -{ - double nv = norm(v); - return v * (nv ? 
1./nv : 0.); -} - - - -//////////////////////////////// vec comma initializer ////////////////////////////////// - - -template static inline -VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val) -{ - VecCommaInitializer<_Tp, cn> commaInitializer((Vec<_Tp, cn>*)&vec); - return (commaInitializer, val); -} - -template inline -VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec) - : MatxCommaInitializer<_Tp, cn, 1>(_vec) -{} - -template template inline -VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value) -{ - CV_DbgAssert( this->idx < cn ); - this->dst->val[this->idx++] = saturate_cast<_Tp>(value); - return *this; -} - -template inline -Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const -{ - CV_DbgAssert( this->idx == cn ); - return *this->dst; -} - -//! @endcond - -///////////////////////////// Matx out-of-class operators //////////////////////////////// - -//! @relates cv::Matx -//! @{ - -template static inline -Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]); - return a; -} - -template static inline -Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]); - return a; -} - -template static inline -Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) -{ - return Matx<_Tp, m, n>(a, b, Matx_AddOp()); -} - -template static inline -Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) -{ - return Matx<_Tp, m, n>(a, b, Matx_SubOp()); -} - -template static inline -Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); - return a; -} - -template static inline -Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); - return a; -} - -template static inline -Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); - return a; -} - -template static inline -Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a) -{ - return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = a.val[i] / alpha; - return a; -} - -template static inline -Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha) -{ - for( int i = 0; i < m*n; i++ ) - a.val[i] = a.val[i] / alpha; - return a; -} - 
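For orientation, the Matx operators being moved in this hunk keep their usual semantics: element-wise addition and subtraction, scalar scaling routed through saturate_cast, and the comma initializer. A minimal usage sketch, assuming only the standard OpenCV 4.x public API (all values are illustrative):

@code{.cpp}
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    cv::Matx22f a(1.f, 2.f,
                  3.f, 4.f);
    cv::Matx22f b = cv::Matx22f::eye();

    cv::Matx22f sum   = a + b;     // element-wise addition (Matx_AddOp)
    cv::Matx22f twice = a * 2.0f;  // scalar scaling (Matx_ScaleOp)
    a *= 0.5;                      // in-place scaling, saturate_cast per element

    cv::Matx23f m;
    m << 1, 2, 3,
         4, 5, 6;                  // MatxCommaInitializer fills m in row-major order

    std::cout << cv::Mat(sum) << "\n" << cv::Mat(twice) << "\n" << cv::Mat(m) << std::endl;
    return 0;
}
@endcode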
-template static inline -Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha) -{ - return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha) -{ - return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a) -{ - return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp()); -} - -template static inline -Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b) -{ - return Matx<_Tp, m, n>(a, b, Matx_MatMulOp()); -} - -template static inline -Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b) -{ - Matx<_Tp, m, 1> c(a, b, Matx_MatMulOp()); - return (const Vec<_Tp, m>&)(c); -} - -template static inline -bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) -{ - for( int i = 0; i < m*n; i++ ) - if( a.val[i] != b.val[i] ) return false; - return true; -} - -template static inline -bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) -{ - return !(a == b); -} - -//! @} - -////////////////////////////// Vec out-of-class operators //////////////////////////////// - -//! @relates cv::Vec -//! @{ +Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v); template static inline -Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b) -{ - for( int i = 0; i < cn; i++ ) - a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]); - return a; -} +Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b); template static inline -Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b) -{ - for( int i = 0; i < cn; i++ ) - a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]); - return a; -} +Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b); template static inline -Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b) -{ - return Vec<_Tp, cn>(a, b, Matx_AddOp()); -} +Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b); template static inline -Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b) -{ - return Vec<_Tp, cn>(a, b, Matx_SubOp()); -} +Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b); template static inline -Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha) -{ - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*alpha); - return a; -} +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha); template static inline -Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha) -{ - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*alpha); - return a; -} +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha); template static inline -Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha) -{ - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*alpha); - return a; -} +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha); template static inline -Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha) -{ - double ialpha = 1./alpha; - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*ialpha); - return a; -} +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha); template static inline -Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha) -{ - float ialpha = 1.f/alpha; - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*ialpha); - return a; -} +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha); template static inline -Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double 
alpha) -{ - double ialpha = 1./alpha; - for( int i = 0; i < cn; i++ ) - a[i] = saturate_cast<_Tp>(a[i]*ialpha); - return a; -} +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha); template static inline -Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha); template static inline -Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a); template static inline -Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha); template static inline -Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a); template static inline -Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha); template static inline -Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a) -{ - return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a); template static inline -Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha) -{ - return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha); template static inline -Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha) -{ - return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha); template static inline -Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha) -{ - return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); -} +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha); template static inline -Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a) -{ - Vec<_Tp,cn> t; - for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]); - return t; -} +Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a); -template inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) -{ - return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]), - saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]), - saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]), - saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0])); -} +template inline +Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2); -template inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) -{ - v1 = v1 * v2; - return v1; -} +template inline +Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2); -//! @} +//! @} core_basic } // cv +#include "opencv2/core/matx.inl.hpp" + #endif // OPENCV_CORE_MATX_HPP diff --git a/3rdParty/opencv2/core/matx.inl.hpp b/3rdParty/opencv2/core/matx.inl.hpp new file mode 100644 index 0000000000..faa3e749d6 --- /dev/null +++ b/3rdParty/opencv2/core/matx.inl.hpp @@ -0,0 +1,1115 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
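The Vec<_Tp, 4> operator* reduced to a declaration earlier in this hunk is the Hamilton (quaternion) product; its former inline body, visible in the removed lines, treats the components as (w, x, y, z). A short hedged check, assuming the standard OpenCV 4.x API:

@code{.cpp}
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Unit quaternions i and j stored as (w, x, y, z); their product should be k.
    cv::Vec4d i(0.0, 1.0, 0.0, 0.0);
    cv::Vec4d j(0.0, 0.0, 1.0, 0.0);

    cv::Vec4d k = i * j;  // expected (0, 0, 0, 1) from the formula above
    std::cout << k[0] << " " << k[1] << " " << k[2] << " " << k[3] << std::endl;
    return 0;
}
@endcode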
+ +#ifndef OPENCV_CORE_MATX_INL_HPP +#define OPENCV_CORE_MATX_INL_HPP + +#ifndef __cplusplus +# error matx.inl.hpp header must be compiled as C++ +#endif + +#include "opencv2/core/matx.hpp" + +namespace cv +{ + +//============================================================================== +// Helpers + +namespace internal +{ + +template struct Matx_DetOp +{ + double operator ()(const Matx<_Tp, m, m>& a) const + { + Matx<_Tp, m, m> temp = a; + double p = LU(temp.val, m*sizeof(_Tp), m, 0, 0, 0); + if( p == 0 ) + return p; + for( int i = 0; i < m; i++ ) + p *= temp(i, i); + return p; + } +}; + +template struct Matx_DetOp<_Tp, 1> +{ + double operator ()(const Matx<_Tp, 1, 1>& a) const + { + return a(0,0); + } +}; + +template struct Matx_DetOp<_Tp, 2> +{ + double operator ()(const Matx<_Tp, 2, 2>& a) const + { + return a(0,0)*a(1,1) - a(0,1)*a(1,0); + } +}; + +template struct Matx_DetOp<_Tp, 3> +{ + double operator ()(const Matx<_Tp, 3, 3>& a) const + { + return a(0,0)*(a(1,1)*a(2,2) - a(2,1)*a(1,2)) - + a(0,1)*(a(1,0)*a(2,2) - a(2,0)*a(1,2)) + + a(0,2)*(a(1,0)*a(2,1) - a(2,0)*a(1,1)); + } +}; + +template Vec<_Tp, 2> inline conjugate(const Vec<_Tp, 2>& v) +{ + return Vec<_Tp, 2>(v[0], -v[1]); +} + +template Vec<_Tp, 4> inline conjugate(const Vec<_Tp, 4>& v) +{ + return Vec<_Tp, 4>(v[0], -v[1], -v[2], -v[3]); +} + +} // internal:: + + +//============================================================================== +// Matx + +template class DataType< Matx<_Tp, m, n> > +{ +public: + typedef Matx<_Tp, m, n> value_type; + typedef Matx::work_type, m, n> work_type; + typedef _Tp channel_type; + typedef value_type vec_type; + + enum { generic_type = 0, + channels = m * n, + fmt = traits::SafeFmt::fmt + ((channels - 1) << 8) +#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED + ,depth = DataType::depth + ,type = CV_MAKETYPE(depth, channels) +#endif + }; +}; + + +namespace traits { +template +struct Depth< Matx<_Tp, m, n> > { enum { value = Depth<_Tp>::value }; }; +template +struct Type< Matx<_Tp, m, n> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, n*m) }; }; +} // namespace + + +//! 
@brief Comma-separated Matrix Initializer +template class MatxCommaInitializer +{ +public: + MatxCommaInitializer(Matx<_Tp, m, n>* _mtx); + template MatxCommaInitializer<_Tp, m, n>& operator , (T2 val); + Matx<_Tp, m, n> operator *() const; + + Matx<_Tp, m, n>* dst; + int idx; +}; + +template static inline +MatxCommaInitializer<_Tp, m, n> operator << (const Matx<_Tp, m, n>& mtx, _T2 val) +{ + MatxCommaInitializer<_Tp, m, n> commaInitializer((Matx<_Tp, m, n>*)&mtx); + return (commaInitializer, val); +} + +template inline +MatxCommaInitializer<_Tp, m, n>::MatxCommaInitializer(Matx<_Tp, m, n>* _mtx) + : dst(_mtx), idx(0) +{} + +template template inline +MatxCommaInitializer<_Tp, m, n>& MatxCommaInitializer<_Tp, m, n>::operator , (_T2 value) +{ + CV_DbgAssert( idx < m*n ); + dst->val[idx++] = saturate_cast<_Tp>(value); + return *this; +} + +template inline +Matx<_Tp, m, n> MatxCommaInitializer<_Tp, m, n>::operator *() const +{ + CV_DbgAssert( idx == n*m ); + return *dst; +} + +////////////////////////////////// Matx Implementation /////////////////////////////////// + +template inline +Matx<_Tp, m, n>::Matx() +{ + for(int i = 0; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0) +{ + val[0] = v0; + for(int i = 1; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1) +{ + CV_StaticAssert(channels >= 2, "Matx should have at least 2 elements."); + val[0] = v0; val[1] = v1; + for(int i = 2; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2) +{ + CV_StaticAssert(channels >= 3, "Matx should have at least 3 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; + for(int i = 3; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3) +{ + CV_StaticAssert(channels >= 4, "Matx should have at least 4 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + for(int i = 4; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4) +{ + CV_StaticAssert(channels >= 5, "Matx should have at least 5 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; val[4] = v4; + for(int i = 5; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5) +{ + CV_StaticAssert(channels >= 6, "Matx should have at least 6 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; + for(int i = 6; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6) +{ + CV_StaticAssert(channels >= 7, "Matx should have at least 7 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; + for(int i = 7; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7) +{ + CV_StaticAssert(channels >= 8, "Matx should have at least 8 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; + for(int i = 8; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8) +{ + CV_StaticAssert(channels >= 9, "Matx should have at least 9 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; 
val[5] = v5; val[6] = v6; val[7] = v7; + val[8] = v8; + for(int i = 9; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp, m, n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9) +{ + CV_StaticAssert(channels >= 10, "Matx should have at least 10 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; + val[8] = v8; val[9] = v9; + for(int i = 10; i < channels; i++) val[i] = _Tp(0); +} + + +template inline +Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11) +{ + CV_StaticAssert(channels >= 12, "Matx should have at least 12 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; + val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; + for(int i = 12; i < channels; i++) val[i] = _Tp(0); +} + +template inline +Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13) +{ + CV_StaticAssert(channels >= 14, "Matx should have at least 14 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; + val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; + val[12] = v12; val[13] = v13; + for (int i = 14; i < channels; i++) val[i] = _Tp(0); +} + + +template inline +Matx<_Tp,m,n>::Matx(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13, _Tp v14, _Tp v15) +{ + CV_StaticAssert(channels >= 16, "Matx should have at least 16 elements."); + val[0] = v0; val[1] = v1; val[2] = v2; val[3] = v3; + val[4] = v4; val[5] = v5; val[6] = v6; val[7] = v7; + val[8] = v8; val[9] = v9; val[10] = v10; val[11] = v11; + val[12] = v12; val[13] = v13; val[14] = v14; val[15] = v15; + for(int i = 16; i < channels; i++) val[i] = _Tp(0); +} + +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif +template inline +Matx<_Tp, m, n>::Matx(const _Tp* values) +{ + for( int i = 0; i < channels; i++ ) val[i] = values[i]; +} +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif + +template inline +Matx<_Tp, m, n>::Matx(std::initializer_list<_Tp> list) +{ + CV_DbgAssert(list.size() == channels); + int i = 0; + for(const auto& elem : list) + { + val[i++] = elem; + } +} + +template inline +Matx<_Tp, m, n> Matx<_Tp, m, n>::all(_Tp alpha) +{ + Matx<_Tp, m, n> M; + for( int i = 0; i < m*n; i++ ) M.val[i] = alpha; + return M; +} + +template inline +Matx<_Tp,m,n> Matx<_Tp,m,n>::zeros() +{ + return all(0); +} + +template inline +Matx<_Tp,m,n> Matx<_Tp,m,n>::ones() +{ + return all(1); +} + +template inline +Matx<_Tp,m,n> Matx<_Tp,m,n>::eye() +{ + Matx<_Tp,m,n> M; + for(int i = 0; i < shortdim; i++) + M(i,i) = 1; + return M; +} + +template inline +_Tp Matx<_Tp, m, n>::dot(const Matx<_Tp, m, n>& M) const +{ + _Tp s = 0; + for( int i = 0; i < channels; i++ ) s += val[i]*M.val[i]; + return s; +} + +template inline +double Matx<_Tp, m, n>::ddot(const Matx<_Tp, m, n>& M) const +{ + double s = 0; + for( int i = 0; i < channels; i++ ) s += (double)val[i]*M.val[i]; + return s; +} + +template inline +Matx<_Tp,m,n> Matx<_Tp,m,n>::diag(const typename Matx<_Tp,m,n>::diag_type& d) +{ + Matx<_Tp,m,n> M; + for(int i = 0; i < shortdim; i++) + M(i,i) = d(i, 0); + return M; +} + +template template +inline 
Matx<_Tp, m, n>::operator Matx() const +{ + Matx M; + for( int i = 0; i < m*n; i++ ) M.val[i] = saturate_cast(val[i]); + return M; +} + +template template inline +Matx<_Tp, m1, n1> Matx<_Tp, m, n>::reshape() const +{ + CV_StaticAssert(m1*n1 == m*n, "Input and destination matrices must have the same number of elements"); + return (const Matx<_Tp, m1, n1>&)*this; +} + +template +template inline +Matx<_Tp, m1, n1> Matx<_Tp, m, n>::get_minor(int base_row, int base_col) const +{ + CV_DbgAssert(0 <= base_row && base_row+m1 <= m && 0 <= base_col && base_col+n1 <= n); + Matx<_Tp, m1, n1> s; + for( int di = 0; di < m1; di++ ) + for( int dj = 0; dj < n1; dj++ ) + s(di, dj) = (*this)(base_row+di, base_col+dj); + return s; +} + +template inline +Matx<_Tp, 1, n> Matx<_Tp, m, n>::row(int i) const +{ + CV_DbgAssert((unsigned)i < (unsigned)m); + return Matx<_Tp, 1, n>(&val[i*n]); +} + +template inline +Matx<_Tp, m, 1> Matx<_Tp, m, n>::col(int j) const +{ + CV_DbgAssert((unsigned)j < (unsigned)n); + Matx<_Tp, m, 1> v; + for( int i = 0; i < m; i++ ) + v.val[i] = val[i*n + j]; + return v; +} + +template inline +typename Matx<_Tp, m, n>::diag_type Matx<_Tp, m, n>::diag() const +{ + diag_type d; + for( int i = 0; i < shortdim; i++ ) + d.val[i] = val[i*n + i]; + return d; +} + +template inline +const _Tp& Matx<_Tp, m, n>::operator()(int row_idx, int col_idx) const +{ + CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n ); + return this->val[row_idx*n + col_idx]; +} + +template inline +_Tp& Matx<_Tp, m, n>::operator ()(int row_idx, int col_idx) +{ + CV_DbgAssert( (unsigned)row_idx < (unsigned)m && (unsigned)col_idx < (unsigned)n ); + return val[row_idx*n + col_idx]; +} + +template inline +const _Tp& Matx<_Tp, m, n>::operator ()(int i) const +{ + CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row"); + CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) ); + return val[i]; +} + +template inline +_Tp& Matx<_Tp, m, n>::operator ()(int i) +{ + CV_StaticAssert(m == 1 || n == 1, "Single index indexation requires matrix to be a column or a row"); + CV_DbgAssert( (unsigned)i < (unsigned)(m+n-1) ); + return val[i]; +} + +template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_AddOp) +{ + for( int i = 0; i < channels; i++ ) + val[i] = saturate_cast<_Tp>(a.val[i] + b.val[i]); +} + +template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_SubOp) +{ + for( int i = 0; i < channels; i++ ) + val[i] = saturate_cast<_Tp>(a.val[i] - b.val[i]); +} + +template template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, _T2 alpha, Matx_ScaleOp) +{ + for( int i = 0; i < channels; i++ ) + val[i] = saturate_cast<_Tp>(a.val[i] * alpha); +} + +template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_MulOp) +{ + for( int i = 0; i < channels; i++ ) + val[i] = saturate_cast<_Tp>(a.val[i] * b.val[i]); +} + +template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b, Matx_DivOp) +{ + for( int i = 0; i < channels; i++ ) + val[i] = saturate_cast<_Tp>(a.val[i] / b.val[i]); +} + +template template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b, Matx_MatMulOp) +{ + for( int i = 0; i < m; i++ ) + for( int j = 0; j < n; j++ ) + { + _Tp s = 0; + for( int k = 0; k < l; k++ ) + s += a(i, k) * b(k, j); + val[i*n + j] = s; + } +} + +template inline +Matx<_Tp,m,n>::Matx(const Matx<_Tp, n, m>& a, Matx_TOp) +{ + 
for( int i = 0; i < m; i++ ) + for( int j = 0; j < n; j++ ) + val[i*n + j] = a(j, i); +} + +template inline +Matx<_Tp, m, n> Matx<_Tp, m, n>::mul(const Matx<_Tp, m, n>& a) const +{ + return Matx<_Tp, m, n>(*this, a, Matx_MulOp()); +} + +template inline +Matx<_Tp, m, n> Matx<_Tp, m, n>::div(const Matx<_Tp, m, n>& a) const +{ + return Matx<_Tp, m, n>(*this, a, Matx_DivOp()); +} + +template inline +Matx<_Tp, n, m> Matx<_Tp, m, n>::t() const +{ + return Matx<_Tp, n, m>(*this, Matx_TOp()); +} + +template inline +Vec<_Tp, n> Matx<_Tp, m, n>::solve(const Vec<_Tp, m>& rhs, int method) const +{ + Matx<_Tp, n, 1> x = solve((const Matx<_Tp, m, 1>&)(rhs), method); + return (Vec<_Tp, n>&)(x); +} + +template static inline +double determinant(const Matx<_Tp, m, m>& a) +{ + return cv::internal::Matx_DetOp<_Tp, m>()(a); +} + +template static inline +double trace(const Matx<_Tp, m, n>& a) +{ + _Tp s = 0; + for( int i = 0; i < std::min(m, n); i++ ) + s += a(i,i); + return s; +} + +template static inline +double norm(const Matx<_Tp, m, n>& M) +{ + return std::sqrt(normL2Sqr<_Tp, double>(M.val, m*n)); +} + +template static inline +double norm(const Matx<_Tp, m, n>& M, int normType) +{ + switch(normType) { + case NORM_INF: + return (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); + case NORM_L1: + return (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); + case NORM_L2SQR: + return (double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n); + default: + case NORM_L2: + return std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n)); + } +} + +template static inline +Matx<_Tp1, m, n>& operator += (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]); + return a; +} + +template static inline +Matx<_Tp1, m, n>& operator -= (Matx<_Tp1, m, n>& a, const Matx<_Tp2, m, n>& b) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]); + return a; +} + +template static inline +Matx<_Tp, m, n> operator + (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) +{ + return Matx<_Tp, m, n>(a, b, Matx_AddOp()); +} + +template static inline +Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) +{ + return Matx<_Tp, m, n>(a, b, Matx_SubOp()); +} + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, int alpha) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); + return a; +} + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, float alpha) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); + return a; +} + +template static inline +Matx<_Tp, m, n>& operator *= (Matx<_Tp, m, n>& a, double alpha) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = saturate_cast<_Tp>(a.val[i] * alpha); + return a; +} + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, int alpha) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, float alpha) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, n>& a, double alpha) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator * (int alpha, const Matx<_Tp, m, n>& a) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template 
static inline +Matx<_Tp, m, n> operator * (float alpha, const Matx<_Tp, m, n>& a) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a) +{ + return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = a.val[i] / alpha; + return a; +} + +template static inline +Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha) +{ + for( int i = 0; i < m*n; i++ ) + a.val[i] = a.val[i] / alpha; + return a; +} + +template static inline +Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha) +{ + return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha) +{ + return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a) +{ + return Matx<_Tp, m, n>(a, -1, Matx_ScaleOp()); +} + +template static inline +Matx<_Tp, m, n> operator * (const Matx<_Tp, m, l>& a, const Matx<_Tp, l, n>& b) +{ + return Matx<_Tp, m, n>(a, b, Matx_MatMulOp()); +} + +template static inline +Vec<_Tp, m> operator * (const Matx<_Tp, m, n>& a, const Vec<_Tp, n>& b) +{ + Matx<_Tp, m, 1> c(a, b, Matx_MatMulOp()); + return (const Vec<_Tp, m>&)(c); +} + +template static inline +bool operator == (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) +{ + for( int i = 0; i < m*n; i++ ) + if( a.val[i] != b.val[i] ) return false; + return true; +} + +template static inline +bool operator != (const Matx<_Tp, m, n>& a, const Matx<_Tp, m, n>& b) +{ + return !(a == b); +} + +//============================================================================== +// Vec + +template class DataType< Vec<_Tp, cn> > +{ +public: + typedef Vec<_Tp, cn> value_type; + typedef Vec::work_type, cn> work_type; + typedef _Tp channel_type; + typedef value_type vec_type; + + enum { generic_type = 0, + channels = cn, + fmt = DataType::fmt + ((channels - 1) << 8), +#ifdef OPENCV_TRAITS_ENABLE_DEPRECATED + depth = DataType::depth, + type = CV_MAKETYPE(depth, channels), +#endif + _dummy_enum_finalizer = 0 + }; +}; + +namespace traits { +template +struct Depth< Vec<_Tp, cn> > { enum { value = Depth<_Tp>::value }; }; +template +struct Type< Vec<_Tp, cn> > { enum { value = CV_MAKETYPE(Depth<_Tp>::value, cn) }; }; +} // namespace + +/** @brief Comma-separated Vec Initializer +*/ +template class VecCommaInitializer : public MatxCommaInitializer<_Tp, m, 1> +{ +public: + VecCommaInitializer(Vec<_Tp, m>* _vec); + template VecCommaInitializer<_Tp, m>& operator , (T2 val); + Vec<_Tp, m> operator *() const; +}; + +template static inline +VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val) +{ + VecCommaInitializer<_Tp, cn> commaInitializer((Vec<_Tp, cn>*)&vec); + return (commaInitializer, val); +} + +template inline +VecCommaInitializer<_Tp, cn>::VecCommaInitializer(Vec<_Tp, cn>* _vec) + : MatxCommaInitializer<_Tp, cn, 1>(_vec) +{} + +template template inline +VecCommaInitializer<_Tp, cn>& VecCommaInitializer<_Tp, cn>::operator , (_T2 value) +{ + CV_DbgAssert( this->idx < cn ); + this->dst->val[this->idx++] = saturate_cast<_Tp>(value); + return *this; +} + +template inline +Vec<_Tp, cn> VecCommaInitializer<_Tp, cn>::operator *() const +{ + CV_DbgAssert( this->idx == cn ); + return *this->dst; +} + + +template inline +Vec<_Tp, cn>::Vec() {} + +template inline 
+Vec<_Tp, cn>::Vec(_Tp v0) + : Matx<_Tp, cn, 1>(v0) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1) + : Matx<_Tp, cn, 1>(v0, v1) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2) + : Matx<_Tp, cn, 1>(v0, v1, v2) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9) {} + +template inline +Vec<_Tp, cn>::Vec(_Tp v0, _Tp v1, _Tp v2, _Tp v3, _Tp v4, _Tp v5, _Tp v6, _Tp v7, _Tp v8, _Tp v9, _Tp v10, _Tp v11, _Tp v12, _Tp v13) + : Matx<_Tp, cn, 1>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) {} + +template inline +Vec<_Tp, cn>::Vec(const _Tp* values) + : Matx<_Tp, cn, 1>(values) {} + +template inline +Vec<_Tp, cn>::Vec(std::initializer_list<_Tp> list) + : Matx<_Tp, cn, 1>(list) {} + +template inline +Vec<_Tp, cn>::Vec(const Vec<_Tp, cn>& m) + : Matx<_Tp, cn, 1>(m.val) {} + +template inline +Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp op) + : Matx<_Tp, cn, 1>(a, b, op) {} + +template inline +Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp op) + : Matx<_Tp, cn, 1>(a, b, op) {} + +template template inline +Vec<_Tp, cn>::Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp op) + : Matx<_Tp, cn, 1>(a, alpha, op) {} + +template inline +Vec<_Tp, cn> Vec<_Tp, cn>::all(_Tp alpha) +{ + Vec v; + for( int i = 0; i < cn; i++ ) v.val[i] = alpha; + return v; +} + +template inline +Vec<_Tp, cn> Vec<_Tp, cn>::ones() +{ + return Vec::all(1); +} + +template inline +Vec<_Tp, cn> Vec<_Tp, cn>::zeros() +{ + return Vec::all(0); +} + +template inline +Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_Tp, cn>& v) const +{ + Vec<_Tp, cn> w; + for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]); + return w; +} + +template<> inline +Vec Vec::conj() const +{ + return cv::internal::conjugate(*this); +} + +template<> inline +Vec Vec::conj() const +{ + return cv::internal::conjugate(*this); +} + +template<> inline +Vec Vec::conj() const +{ + return cv::internal::conjugate(*this); +} + +template<> inline +Vec Vec::conj() const +{ + return cv::internal::conjugate(*this); +} + +template inline +Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>&) const +{ + CV_StaticAssert(cn == 3, "for arbitrary-size vector there is no cross-product defined"); + return Vec<_Tp, cn>(); +} + +template<> inline +Vec Vec::cross(const Vec& v) const +{ + return Vec(this->val[1]*v.val[2] - this->val[2]*v.val[1], + this->val[2]*v.val[0] - this->val[0]*v.val[2], + this->val[0]*v.val[1] - this->val[1]*v.val[0]); +} + +template<> inline +Vec Vec::cross(const Vec& v) const 
+{ + return Vec(this->val[1]*v.val[2] - this->val[2]*v.val[1], + this->val[2]*v.val[0] - this->val[0]*v.val[2], + this->val[0]*v.val[1] - this->val[1]*v.val[0]); +} + +template template inline +Vec<_Tp, cn>::operator Vec() const +{ + Vec v; + for( int i = 0; i < cn; i++ ) v.val[i] = saturate_cast(this->val[i]); + return v; +} + +template inline +const _Tp& Vec<_Tp, cn>::operator [](int i) const +{ + CV_DbgAssert( (unsigned)i < (unsigned)cn ); + return this->val[i]; +} + +template inline +_Tp& Vec<_Tp, cn>::operator [](int i) +{ + CV_DbgAssert( (unsigned)i < (unsigned)cn ); + return this->val[i]; +} + +template inline +const _Tp& Vec<_Tp, cn>::operator ()(int i) const +{ + CV_DbgAssert( (unsigned)i < (unsigned)cn ); + return this->val[i]; +} + +template inline +_Tp& Vec<_Tp, cn>::operator ()(int i) +{ + CV_DbgAssert( (unsigned)i < (unsigned)cn ); + return this->val[i]; +} + +template inline +Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v) +{ + double nv = norm(v); + return v * (nv ? 1./nv : 0.); +} + +template static inline +Vec<_Tp1, cn>& operator += (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b) +{ + for( int i = 0; i < cn; i++ ) + a.val[i] = saturate_cast<_Tp1>(a.val[i] + b.val[i]); + return a; +} + +template static inline +Vec<_Tp1, cn>& operator -= (Vec<_Tp1, cn>& a, const Vec<_Tp2, cn>& b) +{ + for( int i = 0; i < cn; i++ ) + a.val[i] = saturate_cast<_Tp1>(a.val[i] - b.val[i]); + return a; +} + +template static inline +Vec<_Tp, cn> operator + (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b) +{ + return Vec<_Tp, cn>(a, b, Matx_AddOp()); +} + +template static inline +Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a, const Vec<_Tp, cn>& b) +{ + return Vec<_Tp, cn>(a, b, Matx_SubOp()); +} + +template static inline +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, int alpha) +{ + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*alpha); + return a; +} + +template static inline +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, float alpha) +{ + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*alpha); + return a; +} + +template static inline +Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha) +{ + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*alpha); + return a; +} + +template static inline +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha) +{ + double ialpha = 1./alpha; + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*ialpha); + return a; +} + +template static inline +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha) +{ + float ialpha = 1.f/alpha; + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*ialpha); + return a; +} + +template static inline +Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha) +{ + double ialpha = 1./alpha; + for( int i = 0; i < cn; i++ ) + a[i] = saturate_cast<_Tp>(a[i]*ialpha); + return a; +} + +template static inline +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, int alpha) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator * (int alpha, const Vec<_Tp, cn>& a) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, float alpha) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator * (float alpha, const Vec<_Tp, cn>& a) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator * (const Vec<_Tp, cn>& a, double alpha) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); 
+} + +template static inline +Vec<_Tp, cn> operator * (double alpha, const Vec<_Tp, cn>& a) +{ + return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, int alpha) +{ + return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, float alpha) +{ + return Vec<_Tp, cn>(a, 1.f/alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator / (const Vec<_Tp, cn>& a, double alpha) +{ + return Vec<_Tp, cn>(a, 1./alpha, Matx_ScaleOp()); +} + +template static inline +Vec<_Tp, cn> operator - (const Vec<_Tp, cn>& a) +{ + Vec<_Tp,cn> t; + for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]); + return t; +} + +template inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) +{ + return Vec<_Tp, 4>(saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]), + saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]), + saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]), + saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0])); +} + +template inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2) +{ + v1 = v1 * v2; + return v1; +} + +} // cv:: + +#endif // OPENCV_CORE_MATX_INL_HPP diff --git a/3rdParty/opencv2/core/ocl.hpp b/3rdParty/opencv2/core/ocl.hpp index bb243937b9..ab561691d8 100644 --- a/3rdParty/opencv2/core/ocl.hpp +++ b/3rdParty/opencv2/core/ocl.hpp @@ -127,6 +127,11 @@ class CV_EXPORTS_W_SIMPLE Device CV_WRAP int singleFPConfig() const; CV_WRAP int halfFPConfig() const; + /// true if 'cl_khr_fp64' extension is available + CV_WRAP bool hasFP64() const; + /// true if 'cl_khr_fp16' extension is available + CV_WRAP bool hasFP16() const; + CV_WRAP bool endianLittle() const; CV_WRAP bool errorCorrectionSupport() const; @@ -697,7 +702,8 @@ class CV_EXPORTS PlatformInfo Impl* p; }; -CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf); +CV_EXPORTS CV_DEPRECATED const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf); +CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf, size_t buf_size); CV_EXPORTS const char* typeToStr(int t); CV_EXPORTS const char* memopTypeToStr(int t); CV_EXPORTS const char* vecopTypeToStr(int t); @@ -778,7 +784,7 @@ class CV_EXPORTS Timer void start(); void stop(); - uint64 durationNS() const; //< duration in nanoseconds + uint64 durationNS() const; ///< duration in nanoseconds protected: struct Impl; diff --git a/3rdParty/opencv2/core/opencl/opencl_info.hpp b/3rdParty/opencv2/core/opencl/opencl_info.hpp index 3baf301fcd..06c5f9b94d 100644 --- a/3rdParty/opencv2/core/opencl/opencl_info.hpp +++ b/3rdParty/opencv2/core/opencl/opencl_info.hpp @@ -3,6 +3,7 @@ // of this distribution and at http://opencv.org/license.html. #include +#include #include #include @@ -140,13 +141,13 @@ static void dumpOpenCLInformation() DUMP_MESSAGE_STDOUT(" Max memory allocation size = " << maxMemAllocSizeStr); DUMP_CONFIG_PROPERTY("cv_ocl_current_maxMemAllocSize", device.maxMemAllocSize()); - const char* doubleSupportStr = device.doubleFPConfig() > 0 ? "Yes" : "No"; + const char* doubleSupportStr = device.hasFP64() ? 
"Yes" : "No"; DUMP_MESSAGE_STDOUT(" Double support = " << doubleSupportStr); - DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.doubleFPConfig() > 0); + DUMP_CONFIG_PROPERTY("cv_ocl_current_haveDoubleSupport", device.hasFP64()); - const char* halfSupportStr = device.halfFPConfig() > 0 ? "Yes" : "No"; + const char* halfSupportStr = device.hasFP16() ? "Yes" : "No"; DUMP_MESSAGE_STDOUT(" Half support = " << halfSupportStr); - DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.halfFPConfig() > 0); + DUMP_CONFIG_PROPERTY("cv_ocl_current_haveHalfSupport", device.hasFP16()); const char* isUnifiedMemoryStr = device.hostUnifiedMemory() ? "Yes" : "No"; DUMP_MESSAGE_STDOUT(" Host unified memory = " << isUnifiedMemoryStr); diff --git a/3rdParty/opencv2/core/opengl.hpp b/3rdParty/opencv2/core/opengl.hpp index 1902fc9bae..72387bd824 100644 --- a/3rdParty/opencv2/core/opengl.hpp +++ b/3rdParty/opencv2/core/opengl.hpp @@ -57,7 +57,7 @@ This section describes OpenGL interoperability. To enable OpenGL support, configure OpenCV using CMake with WITH_OPENGL=ON . Currently OpenGL is supported only with WIN32, GTK and Qt backends on Windows and Linux (MacOS and Android are not -supported). For GTK backend gtkglext-1.0 library is required. +supported). For GTK-2.0 backend gtkglext-1.0 library is required. To use OpenGL functionality you should first create OpenGL context (window or frame buffer). You can do this with namedWindow function or with other OpenGL toolkit (GLUT, for example). @@ -703,10 +703,18 @@ cv::ogl::Texture2D::Format cv::ogl::Texture2D::format() const /////// +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif inline cv::ogl::Arrays::Arrays() : size_(0) { } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif inline int cv::ogl::Arrays::size() const diff --git a/3rdParty/opencv2/core/operations.hpp b/3rdParty/opencv2/core/operations.hpp index 56f7ffe40c..f754b6f01b 100644 --- a/3rdParty/opencv2/core/operations.hpp +++ b/3rdParty/opencv2/core/operations.hpp @@ -61,10 +61,10 @@ # define CV_FORMAT_PRINTF(A, B) #endif -//! @cond IGNORED - namespace cv { +//! @cond IGNORED + ////////////////////////////// Matx methods depending on core API ///////////////////////////// @@ -424,26 +424,6 @@ template static inline _Tp randu() return (_Tp)theRNG(); } -///////////////////////////////// Formatted string generation ///////////////////////////////// - -/** @brief Returns a text string formatted using the printf-like expression. - -The function acts like sprintf but forms and returns an STL string. It can be used to form an error -message in the Exception constructor. -@param fmt printf-compatible formatting specifiers. - -**Note**: -|Type|Specifier| -|-|-| -|`const char*`|`%s`| -|`char`|`%c`| -|`float` / `double`|`%f`,`%g`| -|`int`, `long`, `long long`|`%d`, `%ld`, ``%lld`| -|`unsigned`, `unsigned long`, `unsigned long long`|`%u`, `%lu`, `%llu`| -|`uint64` -> `uintmax_t`, `int64` -> `intmax_t`|`%ju`, `%jd`| -|`size_t`|`%zu`| - */ -CV_EXPORTS String format( const char* fmt, ... ) CV_FORMAT_PRINTF(1, 2); ///////////////////////////////// Formatted output of cv::Mat ///////////////////////////////// @@ -496,6 +476,28 @@ int print(const Matx<_Tp, m, n>& matx, FILE* stream = stdout) //! @endcond +///////////////////////////////// Formatted string generation ///////////////////////////////// + +/** @brief Returns a text string formatted using the printf-like expression. 
+ +The function acts like sprintf but forms and returns an STL string. It can be used to form an error +message in the Exception constructor. +@param fmt printf-compatible formatting specifiers. + +**Note**: +|Type|Specifier| +|-|-| +|`const char*`|`%s`| +|`char`|`%c`| +|`float` / `double`|`%f`,`%g`| +|`int`, `long`, `long long`|`%d`, `%ld`, ``%lld`| +|`unsigned`, `unsigned long`, `unsigned long long`|`%u`, `%lu`, `%llu`| +|`uint64` -> `uintmax_t`, `int64` -> `intmax_t`|`%ju`, `%jd`| +|`size_t`|`%zu`| +@ingroup core_utils + */ +CV_EXPORTS String format(const char* fmt, ...) CV_FORMAT_PRINTF(1, 2); + /****************************************************************************************\ * Auxiliary algorithms * \****************************************************************************************/ @@ -506,7 +508,7 @@ The generic function partition implements an \f$O(N^2)\f$ algorithm for splittin into one or more equivalency classes, as described in . The function returns the number of equivalency classes. -@param _vec Set of elements stored as a vector. +@param vec Set of elements stored as a vector. @param labels Output vector of labels. It contains as many elements as vec. Each label labels[i] is a 0-based cluster index of `vec[i]`. @param predicate Equivalence predicate (pointer to a boolean function of two arguments or an @@ -516,11 +518,11 @@ may or may not be in the same class. @ingroup core_cluster */ template int -partition( const std::vector<_Tp>& _vec, std::vector& labels, +partition( const std::vector<_Tp>& vec, std::vector& labels, _EqPredicate predicate=_EqPredicate()) { - int i, j, N = (int)_vec.size(); - const _Tp* vec = &_vec[0]; + int i, j, N = (int)vec.size(); + const _Tp* _vec = &vec[0]; const int PARENT=0; const int RANK=1; @@ -546,7 +548,7 @@ partition( const std::vector<_Tp>& _vec, std::vector& labels, for( j = 0; j < N; j++ ) { - if( i == j || !predicate(vec[i], vec[j])) + if( i == j || !predicate(_vec[i], _vec[j])) continue; int root2 = j; diff --git a/3rdParty/opencv2/core/optim.hpp b/3rdParty/opencv2/core/optim.hpp index 6770cc1b9e..6dfb59e81e 100644 --- a/3rdParty/opencv2/core/optim.hpp +++ b/3rdParty/opencv2/core/optim.hpp @@ -256,6 +256,7 @@ class CV_EXPORTS ConjGradSolver : public MinProblemSolver //! return codes for cv::solveLP() function enum SolveLPResult { + SOLVELP_LOST = -3, //!< problem is feasible, but solver lost solution due to floating-point arithmetic errors SOLVELP_UNBOUNDED = -2, //!< problem is unbounded (target function can achieve arbitrary high values) SOLVELP_UNFEASIBLE = -1, //!< problem is unfeasible (there are no points that satisfy all the constraints imposed) SOLVELP_SINGLE = 0, //!< there is only one maximum for target function @@ -291,8 +292,12 @@ in the latter case it is understood to correspond to \f$c^T\f$. and the remaining to \f$A\f$. It should contain 32- or 64-bit floating point numbers. @param z The solution will be returned here as a column-vector - it corresponds to \f$c\f$ in the formulation above. It will contain 64-bit floating point numbers. +@param constr_eps allowed numeric disparity for constraints @return One of cv::SolveLPResult */ +CV_EXPORTS_W int solveLP(InputArray Func, InputArray Constr, OutputArray z, double constr_eps); + +/** @overload */ CV_EXPORTS_W int solveLP(InputArray Func, InputArray Constr, OutputArray z); //! 
@} diff --git a/3rdParty/opencv2/core/persistence.hpp b/3rdParty/opencv2/core/persistence.hpp index dbd1225989..cac3755aae 100644 --- a/3rdParty/opencv2/core/persistence.hpp +++ b/3rdParty/opencv2/core/persistence.hpp @@ -53,50 +53,6 @@ # error persistence.hpp header must be compiled as C++ #endif -//! @addtogroup core_c -//! @{ - -/** @brief "black box" representation of the file storage associated with a file on disk. - -Several functions that are described below take CvFileStorage\* as inputs and allow the user to -save or to load hierarchical collections that consist of scalar values, standard CXCore objects -(such as matrices, sequences, graphs), and user-defined objects. - -OpenCV can read and write data in XML (), YAML () or -JSON () formats. Below is an example of 3x3 floating-point identity matrix A, -stored in XML and YAML files -using CXCore functions: -XML: -@code{.xml} - - - - 3 - 3 -
-  <dt>f</dt>
-  <data>1. 0. 0. 0. 1. 0. 0. 0. 1.</data>
- </A>
- </opencv_storage>
-@endcode -YAML: -@code{.yaml} - %YAML:1.0 - A: !!opencv-matrix - rows: 3 - cols: 3 - dt: f - data: [ 1., 0., 0., 0., 1., 0., 0., 0., 1.] -@endcode -As it can be seen from the examples, XML uses nested tags to represent hierarchy, while YAML uses -indentation for that purpose (similar to the Python programming language). - -The same functions can read and write data in both formats; the particular format is determined by -the extension of the opened file, ".xml" for XML files, ".yml" or ".yaml" for YAML and ".json" for -JSON. - */ - -//! @} core_c - #include "opencv2/core/types.hpp" #include "opencv2/core/mat.hpp" @@ -283,13 +239,14 @@ element is a structure of 2 integers, followed by a single-precision floating-po equivalent notations of the above specification are `iif`, `2i1f` and so forth. Other examples: `u` means that the array consists of bytes, and `2d` means the array consists of pairs of doubles. -@see @ref samples/cpp/filestorage.cpp +@see @ref samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp */ //! @{ -/** @example samples/cpp/filestorage.cpp +/** @example samples/cpp/tutorial_code/core/file_input_output/file_input_output.cpp A complete example using the FileStorage interface +Check @ref tutorial_file_input_output_with_xml_yml "the corresponding tutorial" for more details */ ////////////////////////// XML & YAML I/O ////////////////////////// @@ -322,10 +279,10 @@ class CV_EXPORTS_W FileStorage }; enum State { - UNDEFINED = 0, - VALUE_EXPECTED = 1, - NAME_EXPECTED = 2, - INSIDE_MAP = 4 + UNDEFINED = 0, //!< Initial or uninitialized state. + VALUE_EXPECTED = 1, //!< Expecting a value in the current position. + NAME_EXPECTED = 2, //!< Expecting a key/name in the current position. + INSIDE_MAP = 4 //!< Indicates being inside a map (a set of key-value pairs). }; /** @brief The constructors. @@ -408,6 +365,8 @@ class CV_EXPORTS_W FileStorage */ CV_WRAP void write(const String& name, int val); /// @overload + CV_WRAP void write(const String& name, int64_t val); + /// @overload CV_WRAP void write(const String& name, double val); /// @overload CV_WRAP void write(const String& name, const String& val); @@ -573,6 +532,8 @@ class CV_EXPORTS_W_SIMPLE FileNode CV_WRAP size_t rawSize() const; //! returns the node content as an integer. If the node stores floating-point number, it is rounded. operator int() const; + //! returns the node content as a signed 64bit integer. If the node stores floating-point number, it is rounded. + operator int64_t() const; //! returns the node content as float operator float() const; //! returns the node content as double @@ -696,10 +657,8 @@ class CV_EXPORTS FileNodeIterator /////////////////// XML & YAML I/O implementation ////////////////// -//! @relates cv::FileStorage -//! @{ - CV_EXPORTS void write( FileStorage& fs, const String& name, int value ); +CV_EXPORTS void write( FileStorage& fs, const String& name, int64_t value ); CV_EXPORTS void write( FileStorage& fs, const String& name, float value ); CV_EXPORTS void write( FileStorage& fs, const String& name, double value ); CV_EXPORTS void write( FileStorage& fs, const String& name, const String& value ); @@ -711,16 +670,13 @@ CV_EXPORTS void write( FileStorage& fs, const String& name, const std::vector static inline void write(FileStorage& fs, const _Tp& value) { @@ -1118,10 +1068,6 @@ static inline void write(FileStorage& fs, const std::vector& vec) } #endif -//! @} FileStorage - -//! @relates cv::FileNode -//! 
@{ static inline void read(const FileNode& node, bool& value, bool default_value) @@ -1208,11 +1154,6 @@ void read( const FileNode& node, std::vector& vec, const std::vector static inline @@ -1244,11 +1185,6 @@ FileStorage& operator << (FileStorage& fs, char* value) return (fs << String(value)); } -//! @} FileStorage - -//! @relates cv::FileNodeIterator -//! @{ - /** @brief Reads data from a file storage. */ template static inline @@ -1268,11 +1204,6 @@ FileNodeIterator& operator >> (FileNodeIterator& it, std::vector<_Tp>& vec) return it; } -//! @} FileNodeIterator - -//! @relates cv::FileNode -//! @{ - /** @brief Reads data from a file storage. */ template static inline @@ -1323,11 +1254,6 @@ void operator >> (const FileNode& n, DMatch& m) it >> m.queryIdx >> m.trainIdx >> m.imgIdx >> m.distance; } -//! @} FileNode - -//! @relates cv::FileNodeIterator -//! @{ - CV_EXPORTS bool operator == (const FileNodeIterator& it1, const FileNodeIterator& it2); CV_EXPORTS bool operator != (const FileNodeIterator& it1, const FileNodeIterator& it2); @@ -1343,8 +1269,6 @@ bool operator < (const FileNodeIterator& it1, const FileNodeIterator& it2) return it1.remaining() > it2.remaining(); } -//! @} FileNodeIterator - } // cv #endif // OPENCV_CORE_PERSISTENCE_HPP diff --git a/3rdParty/opencv2/core/quaternion.hpp b/3rdParty/opencv2/core/quaternion.hpp index ce0165a48a..ab96ce2bae 100644 --- a/3rdParty/opencv2/core/quaternion.hpp +++ b/3rdParty/opencv2/core/quaternion.hpp @@ -31,7 +31,7 @@ #include namespace cv { -//! @addtogroup core +//! @addtogroup core_quaternion //! @{ //! Unit quaternion flag @@ -77,9 +77,9 @@ class QuatEnum * For intrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =X(\theta_1) Y(\theta_2) Z(\theta_3) \f] * For extrinsic rotations in the order of X-Y-Z, the rotation matrix R can be calculated by:\f[R =Z({\theta_3}) Y({\theta_2}) X({\theta_1})\f] * where - * \f[X({\theta})={\begin{bmatrix}1&0&0\\0&\cos {\theta_1} &-\sin {\theta_1} \\0&\sin {\theta_1} &\cos {\theta_1} \\\end{bmatrix}}, - * Y({\theta})={\begin{bmatrix}\cos \theta_{2}&0&\sin \theta_{2}\\0&1 &0 \\\ -sin \theta_2& 0&\cos \theta_{2} \\\end{bmatrix}}, - * Z({\theta})={\begin{bmatrix}\cos\theta_{3} &-\sin \theta_3&0\\\sin \theta_3 &\cos \theta_3 &0\\0&0&1\\\end{bmatrix}}. + * \f[X({\theta_1})={\begin{bmatrix}1&0&0\\0&\cos {\theta_1} &-\sin {\theta_1} \\0&\sin {\theta_1} &\cos {\theta_1} \\\end{bmatrix}}, + * Y({\theta_2})={\begin{bmatrix}\cos \theta_{2}&0&\sin \theta_{2}\\0&1 &0 \\\ -sin \theta_2& 0&\cos \theta_{2} \\\end{bmatrix}}, + * Z({\theta_3})={\begin{bmatrix}\cos\theta_{3} &-\sin \theta_3&0\\\sin \theta_3 &\cos \theta_3 &0\\0&0&1\\\end{bmatrix}}. * \f] * * The function is designed according to this set of conventions: diff --git a/3rdParty/opencv2/core/quaternion.inl.hpp b/3rdParty/opencv2/core/quaternion.inl.hpp index 0c3a5e1264..9d5a6d86f1 100644 --- a/3rdParty/opencv2/core/quaternion.inl.hpp +++ b/3rdParty/opencv2/core/quaternion.inl.hpp @@ -28,7 +28,7 @@ #define OPENCV_CORE_QUATERNION_INL_HPP #ifndef OPENCV_CORE_QUATERNION_HPP -#erorr This is not a standalone header. Include quaternion.hpp instead. +#error This is not a standalone header. Include quaternion.hpp instead. 
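To make the intrinsic X-Y-Z Euler convention documented in the quaternion.hpp hunk above concrete, a small sketch; it assumes the cv::Quatd::createFromEulerAngles API of recent OpenCV 4.x releases and uses illustrative angle values:

@code{.cpp}
#include <opencv2/core.hpp>
#include <opencv2/core/quaternion.hpp>
#include <iostream>

int main()
{
    // Intrinsic X-Y-Z angles (theta1, theta2, theta3); by the convention above the
    // resulting rotation matrix is R = X(theta1) * Y(theta2) * Z(theta3).
    cv::Vec3d angles(0.1, 0.2, 0.3);
    cv::Quatd q = cv::Quatd::createFromEulerAngles(angles, cv::QuatEnum::INT_XYZ);

    std::cout << cv::Mat(q.toRotMat3x3()) << std::endl;  // 3x3 rotation matrix of q
    return 0;
}
@endcode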
#endif //@cond IGNORE @@ -745,8 +745,8 @@ Quat Quat::lerp(const Quat &q0, const Quat &q1, const T t) template Quat Quat::slerp(const Quat &q0, const Quat &q1, const T t, QuatAssumeType assumeUnit, bool directChange) { - Quatd v0(q0); - Quatd v1(q1); + Quat v0(q0); + Quat v1(q1); if (!assumeUnit) { v0 = v0.normalize(); @@ -754,7 +754,7 @@ Quat Quat::slerp(const Quat &q0, const Quat &q1, const T t, QuatAssu } T cosTheta = v0.dot(v1); constexpr T DOT_THRESHOLD = 0.995; - if (cosTheta > DOT_THRESHOLD) + if (std::abs(cosTheta) > DOT_THRESHOLD) { return nlerp(v0, v1, t, QUAT_ASSUME_UNIT); } @@ -843,7 +843,7 @@ Quat Quat::interPoint(const Quat &q0, const Quat &q1, template Quat Quat::spline(const Quat &q0, const Quat &q1, const Quat &q2, const Quat &q3, const T t, QuatAssumeType assumeUnit) { - Quatd v0(q0), v1(q1), v2(q2), v3(q3); + Quat v0(q0), v1(q1), v2(q2), v3(q3); if (!assumeUnit) { v0 = v0.normalize(); diff --git a/3rdParty/opencv2/core/saturate.hpp b/3rdParty/opencv2/core/saturate.hpp index 28ff2774b0..04951d42f2 100644 --- a/3rdParty/opencv2/core/saturate.hpp +++ b/3rdParty/opencv2/core/saturate.hpp @@ -46,6 +46,7 @@ #define OPENCV_CORE_SATURATE_HPP #include "opencv2/core/cvdef.h" +#include #include "opencv2/core/fast_math.hpp" namespace cv @@ -157,20 +158,20 @@ template<> inline uint64 saturate_cast(int64 v) { return (uint64)st template<> inline int64 saturate_cast(uint64 v) { return (int64)std::min(v, (uint64)LLONG_MAX); } /** @overload */ -template static inline _Tp saturate_cast(float16_t v) { return saturate_cast<_Tp>((float)v); } +template static inline _Tp saturate_cast(hfloat v) { return saturate_cast<_Tp>((float)v); } // in theory, we could use a LUT for 8u/8s->16f conversion, // but with hardware support for FP32->FP16 conversion the current approach is preferable -template<> inline float16_t saturate_cast(uchar v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(schar v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(ushort v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(short v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(unsigned v){ return float16_t((float)v); } -template<> inline float16_t saturate_cast(int v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(uint64 v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(int64 v) { return float16_t((float)v); } -template<> inline float16_t saturate_cast(float v) { return float16_t(v); } -template<> inline float16_t saturate_cast(double v) { return float16_t((float)v); } +template<> inline hfloat saturate_cast(uchar v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(schar v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(ushort v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(short v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(unsigned v){ return hfloat((float)v); } +template<> inline hfloat saturate_cast(int v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(uint64 v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(int64 v) { return hfloat((float)v); } +template<> inline hfloat saturate_cast(float v) { return hfloat(v); } +template<> inline hfloat saturate_cast(double v) { return hfloat((float)v); } //! 
@} diff --git a/3rdParty/opencv2/core/traits.hpp b/3rdParty/opencv2/core/traits.hpp index e206528df0..d17dc4e508 100644 --- a/3rdParty/opencv2/core/traits.hpp +++ b/3rdParty/opencv2/core/traits.hpp @@ -261,10 +261,10 @@ template<> class DataType }; }; -template<> class DataType +template<> class DataType { public: - typedef float16_t value_type; + typedef hfloat value_type; typedef float work_type; typedef value_type channel_type; typedef value_type vec_type; @@ -347,7 +347,7 @@ template<> class TypeDepth template<> class TypeDepth { enum { depth = CV_16F }; - typedef float16_t value_type; + typedef hfloat value_type; }; #endif diff --git a/3rdParty/opencv2/core/types.hpp b/3rdParty/opencv2/core/types.hpp index e16f9d4ee8..fc9ed18820 100644 --- a/3rdParty/opencv2/core/types.hpp +++ b/3rdParty/opencv2/core/types.hpp @@ -57,6 +57,11 @@ #include "opencv2/core/cvstd.hpp" #include "opencv2/core/matx.hpp" +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4459) // declaration of '...' hides global declaration +#endif + namespace cv { @@ -84,7 +89,7 @@ template class Complex //! conjugation Complex conj() const; - _Tp re, im; //< the real and the imaginary parts + _Tp re, im; ///< the real and the imaginary parts }; typedef Complex Complexf; @@ -470,7 +475,14 @@ template class Rect_ template operator Rect_<_Tp2>() const; //! checks whether the rectangle contains the point - bool contains(const Point_<_Tp>& pt) const; + /*! @warning After OpenCV 4.11.0, when calling Rect.contains() with cv::Point2f / cv::Point2d point, point should not convert/round to int. + * ``` + * Rect_ r(0,0,500,500); Point_ pt(250.0f, 499.9f); + * r.contains(pt) returns false.(OpenCV 4.10.0 or before) + * r.contains(pt) returns true. (OpenCV 4.11.0 or later) + * ``` + */ + template inline bool contains(const Point_<_Tp2>& pt) const; _Tp x; //!< x coordinate of the top-left corner _Tp y; //!< y coordinate of the top-left corner @@ -522,38 +534,44 @@ The sample below demonstrates how to use RotatedRect: @sa CamShift, fitEllipse, minAreaRect, CvBox2D */ -class CV_EXPORTS RotatedRect +class CV_EXPORTS_W_SIMPLE RotatedRect { public: //! default constructor - RotatedRect(); + CV_WRAP RotatedRect(); /** full constructor @param center The rectangle mass center. @param size Width and height of the rectangle. @param angle The rotation angle in a clockwise direction. When the angle is 0, 90, 180, 270 etc., the rectangle becomes an up-right rectangle. */ - RotatedRect(const Point2f& center, const Size2f& size, float angle); + CV_WRAP RotatedRect(const Point2f& center, const Size2f& size, float angle); /** Any 3 end points of the RotatedRect. They must be given in order (either clockwise or anticlockwise). */ - RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3); + CV_WRAP RotatedRect(const Point2f& point1, const Point2f& point2, const Point2f& point3); - /** returns 4 vertices of the rectangle - @param pts The points array for storing rectangle vertices. The order is bottomLeft, topLeft, topRight, bottomRight. + /** returns 4 vertices of the rotated rectangle + @param pts The points array for storing rectangle vertices. The order is _bottomLeft_, _topLeft_, topRight, bottomRight. + @note _Bottom_, _Top_, _Left_ and _Right_ sides refer to the original rectangle (angle is 0), + so after 180 degree rotation _bottomLeft_ point will be located at the top right corner of the + rectangle. */ void points(Point2f pts[]) const; + + CV_WRAP void points(CV_OUT std::vector& pts) const; + //! 
returns the minimal up-right integer rectangle containing the rotated rectangle - Rect boundingRect() const; + CV_WRAP Rect boundingRect() const; //! returns the minimal (exact) floating point rectangle containing the rotated rectangle, not intended for use with images - Rect_ boundingRect2f() const; + CV_WRAP Rect2f boundingRect2f() const; //! returns the rectangle mass center - Point2f center; + CV_PROP_RW Point2f center; //! returns width and height of the rectangle - Size2f size; + CV_PROP_RW Size2f size; //! returns the rotation angle. When the angle is 0, 90, 180, 270 etc., the rectangle becomes an up-right rectangle. - float angle; + CV_PROP_RW float angle; }; template<> class DataType< RotatedRect > @@ -1850,12 +1868,29 @@ Rect_<_Tp>::operator Rect_<_Tp2>() const return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height)); } -template inline -bool Rect_<_Tp>::contains(const Point_<_Tp>& pt) const +template template inline +bool Rect_<_Tp>::contains(const Point_<_Tp2>& pt) const { return x <= pt.x && pt.x < x + width && y <= pt.y && pt.y < y + height; } - +// See https://github.com/opencv/opencv/issues/26016 +template<> template<> inline +bool Rect_::contains(const Point_& pt) const +{ + // std::numeric_limits::digits is 31. + // std::numeric_limits::digits is 53. + // So conversion int->double does not lead to accuracy errors. + const Rect_ _rect(static_cast(x), static_cast(y), static_cast(width), static_cast(height)); + return _rect.contains(pt); +} +template<> template<> inline +bool Rect_::contains(const Point_& _pt) const +{ + // std::numeric_limits::digits is 24. + // std::numeric_limits::digits is 53. + // So conversion float->double does not lead to accuracy errors. + return contains(Point_(static_cast(_pt.x), static_cast(_pt.y))); +} template static inline Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Point_<_Tp>& b ) @@ -2017,6 +2052,15 @@ double jaccardDistance(const Rect_<_Tp>& a, const Rect_<_Tp>& b) { return 1.0 - Aab / (Aa + Ab - Aab); } +/** @brief Finds out if there is any intersection between two rectangles + * + * mainly useful for language bindings + * @param a First rectangle + * @param b Second rectangle + * @return the area of the intersection + */ +CV_EXPORTS_W inline double rectangleIntersectionArea(const Rect2d& a, const Rect2d& b) { return (a & b).area(); } + ////////////////////////////// RotatedRect ////////////////////////////// inline @@ -2436,4 +2480,8 @@ TermCriteria::TermCriteria(int _type, int _maxCount, double _epsilon) } // cv +#ifdef _MSC_VER +#pragma warning(pop) +#endif + #endif //OPENCV_CORE_TYPES_HPP diff --git a/3rdParty/opencv2/core/types_c.h b/3rdParty/opencv2/core/types_c.h index 6a2d0c0fa6..473062a53f 100644 --- a/3rdParty/opencv2/core/types_c.h +++ b/3rdParty/opencv2/core/types_c.h @@ -90,13 +90,7 @@ #include #endif // SKIP_INCLUDES -#if defined _WIN32 -# define CV_CDECL __cdecl -# define CV_STDCALL __stdcall -#else -# define CV_CDECL -# define CV_STDCALL -#endif + #ifndef CV_DEFAULT # ifdef __cplusplus @@ -203,21 +197,13 @@ enum { * Common macros and inline functions * \****************************************************************************************/ -#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t)) - -/** min & max without jumps */ -#define CV_IMIN(a, b) ((a) ^ (((a)^(b)) & (((a) < (b)) - 1))) - -#define CV_IMAX(a, b) ((a) ^ (((a)^(b)) & (((a) > (b)) - 1))) - /** absolute value without jumps */ #ifndef __cplusplus # define CV_IABS(a) (((a) ^ ((a) < 0 ? 
-1 : 0)) - ((a) < 0 ? -1 : 0)) #else # define CV_IABS(a) abs(a) #endif -#define CV_CMP(a,b) (((a) > (b)) - ((a) < (b))) -#define CV_SIGN(a) CV_CMP((a),0) + #define cvInvSqrt(value) ((float)(1./sqrt(value))) #define cvSqrt(value) ((float)sqrt(value)) @@ -675,8 +661,6 @@ CV_INLINE int cvIplDepth( int type ) #define CV_MATND_MAGIC_VAL 0x42430000 #define CV_TYPE_NAME_MATND "opencv-nd-matrix" -#define CV_MAX_DIM 32 - #ifdef __cplusplus typedef struct CvMatND CvMatND; CV_EXPORTS CvMatND cvMatND(const cv::Mat& m); diff --git a/3rdParty/opencv2/core/utility.hpp b/3rdParty/opencv2/core/utility.hpp index e7395cb30c..985d20dcb4 100644 --- a/3rdParty/opencv2/core/utility.hpp +++ b/3rdParty/opencv2/core/utility.hpp @@ -176,14 +176,45 @@ extern "C" typedef int (*ErrorCallback)( int status, const char* func_name, */ CV_EXPORTS ErrorCallback redirectError( ErrorCallback errCallback, void* userdata=0, void** prevUserdata=0); +/** @brief Generates a unique temporary file name. + +This function generates a full, unique file path for a temporary file, +which can be used to create temporary files for various purposes. + +@param suffix (optional) The desired file extension or suffix for the temporary file (e.g., ".png", ".txt"). +If no suffix is provided (suffix = 0), the file will not have a specific extension. + +@return cv::String A full unique path for the temporary file. + +@note +- The function does not create the file, it only generates the name. +- The file name is unique for the system session. +- Works cross-platform (Windows, Linux, macOS). + */ CV_EXPORTS String tempfile( const char* suffix = 0); + +/** @brief Searches for files matching the specified pattern in a directory. + +This function searches for files that match a given pattern (e.g., `*.jpg`) +in the specified directory. The search can be limited to the directory itself +or be recursive, including subdirectories. + +@param pattern The file search pattern, which can include wildcards like `*` +(for matching multiple characters) or `?` (for matching a single character). + +@param result Output vector where the file paths matching the search +pattern will be stored. +@param recursive (optional) Boolean flag indicating whether to search +subdirectories recursively. If true, the search will include all subdirectories. +The default value is `false`. + */ CV_EXPORTS void glob(String pattern, std::vector& result, bool recursive = false); -/** @brief OpenCV will try to set the number of threads for the next parallel region. +/** @brief OpenCV will try to set the number of threads for subsequent parallel regions. -If threads == 0, OpenCV will disable threading optimizations and run all it's functions -sequentially. Passing threads \< 0 will reset threads number to system default. This function must -be called outside of parallel region. +If threads == 1, OpenCV will disable threading optimizations and run all it's functions +sequentially. Passing threads \< 0 will reset threads number to system default. +The function is not thread-safe. It must not be called in parallel region or concurrent threads. OpenCV will try to run its functions with specified threads number, but some behaviour differs from framework: @@ -309,11 +340,12 @@ class CV_EXPORTS_W TickMeter //! stops counting ticks. 
CV_WRAP void stop() { - int64 time = cv::getTickCount(); + const int64 time = cv::getTickCount(); if (startTime == 0) return; ++counter; - sumTime += (time - startTime); + lastTime = time - startTime; + sumTime += lastTime; startTime = 0; } @@ -336,11 +368,35 @@ class CV_EXPORTS_W TickMeter } //! returns passed time in seconds. - CV_WRAP double getTimeSec() const + CV_WRAP double getTimeSec() const { return (double)getTimeTicks() / getTickFrequency(); } + //! returns counted ticks of the last iteration. + CV_WRAP int64 getLastTimeTicks() const + { + return lastTime; + } + + //! returns passed time of the last iteration in microseconds. + CV_WRAP double getLastTimeMicro() const + { + return getLastTimeMilli()*1e3; + } + + //! returns passed time of the last iteration in milliseconds. + CV_WRAP double getLastTimeMilli() const + { + return getLastTimeSec()*1e3; + } + + //! returns passed time of the last iteration in seconds. + CV_WRAP double getLastTimeSec() const + { + return (double)getLastTimeTicks() / getTickFrequency(); + } + //! returns internal counter value. CV_WRAP int64 getCounter() const { @@ -373,15 +429,17 @@ class CV_EXPORTS_W TickMeter //! resets internal values. CV_WRAP void reset() { - startTime = 0; - sumTime = 0; counter = 0; + sumTime = 0; + startTime = 0; + lastTime = 0; } private: int64 counter; int64 sumTime; int64 startTime; + int64 lastTime; }; /** @brief output operator @@ -544,6 +602,18 @@ bool isAligned(const void* p1, const void* p2, const void* p3, const void* p4) return isAligned(((size_t)p1)|((size_t)p2)|((size_t)p3)|((size_t)p4)); } +/*! @brief Flags that allow to modify some functions' behavior. Used as a set of flags. +*/ +enum AlgorithmHint { + ALGO_HINT_DEFAULT = 0, //!< Default algorithm behaviour defined during OpenCV build + ALGO_HINT_ACCURATE = 1, //!< Use generic portable implementation + ALGO_HINT_APPROX = 2, //!< Allow alternative approximations to get faster implementation. Behaviour and result depend on the platform +}; + +/*! @brief Returns AlgorithmHint defined during OpenCV compilation. Defines #ALGO_HINT_DEFAULT behavior. + */ +CV_EXPORTS_W AlgorithmHint getDefaultAlgorithmHint(); + /** @brief Enables or disables the optimized code. The function can be used to dynamically turn on and off optimized dispatched code (code that uses SSE4.2, AVX/AVX2, @@ -773,7 +843,7 @@ The sample below demonstrates how to use CommandLineParser: The keys parameter is a string containing several blocks, each one is enclosed in curly braces and describes one argument. Each argument contains three parts separated by the `|` symbol: --# argument names is a space-separated list of option synonyms (to mark argument as positional, prefix it with the `@` symbol) +-# argument names is a list of option synonyms separated by standard space characters ' ' (to mark argument as positional, prefix it with the `@` symbol) -# default value will be used if the argument was not provided (can be empty) -# help message (can be empty) @@ -796,6 +866,8 @@ For example: Note that there are no default values for `help` and `timestamp` so we can check their presence using the `has()` method. Arguments with default values are considered to be always present. Use the `get()` method in these cases to check their actual value instead. +Note that whitespace characters other than standard spaces are considered part of the string. +Additionally, leading and trailing standard spaces around the help messages are ignored.
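A minimal sketch of the keys format described above; the option names and defaults here are made up for illustration:

```cpp
#include <opencv2/core/utility.hpp>

int main(int argc, char** argv)
{
    const cv::String keys =
        "{ help h usage ? |      | print this message }"
        "{ @image         |<none>| input image (positional, <none> forces a non-empty value) }"
        "{ N count        | 100  | number of iterations }";
    cv::CommandLineParser parser(argc, argv, keys);
    if (parser.has("help")) { parser.printMessage(); return 0; }
    cv::String image = parser.get<cv::String>("@image");
    int n = parser.get<int>("N");
    if (!parser.check()) { parser.printErrors(); return 1; }
    (void)image; (void)n;
    return 0;
}
```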
String keys like `get("@image1")` return the empty string `""` by default - even with an empty default value. Use the special `` default value to enforce that the returned string must not be empty. (like in `get("@image2")`) diff --git a/3rdParty/opencv2/core/utils/allocator_stats.impl.hpp b/3rdParty/opencv2/core/utils/allocator_stats.impl.hpp index e2bb209626..99ceabc547 100644 --- a/3rdParty/opencv2/core/utils/allocator_stats.impl.hpp +++ b/3rdParty/opencv2/core/utils/allocator_stats.impl.hpp @@ -9,8 +9,6 @@ //#define OPENCV_DISABLE_ALLOCATOR_STATS -#ifdef CV_CXX11 - #include #ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE @@ -26,14 +24,6 @@ #define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long #endif -#else // CV_CXX11 - -#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE -#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int // CV_XADD supports int only -#endif - -#endif // CV_CXX11 - namespace cv { namespace utils { #ifdef CV__ALLOCATOR_STATS_LOG @@ -59,7 +49,7 @@ class AllocatorStatistics : public AllocatorStatisticsInterface void onAllocate(size_t /*sz*/) {} void onFree(size_t /*sz*/) {} -#elif defined(CV_CXX11) +#else protected: typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; @@ -104,49 +94,7 @@ class AllocatorStatistics : public AllocatorStatisticsInterface #endif curr -= (counter_t)sz; } - -#else // non C++11 - -protected: - typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t; - volatile counter_t curr, total, total_allocs, peak; // overflow is possible, CV_XADD operates with 'int' only -public: - AllocatorStatistics() - : curr(0), total(0), total_allocs(0), peak(0) - {} - ~AllocatorStatistics() CV_OVERRIDE {} - - uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; } - uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; } - uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; } - uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; } - - void resetPeakUsage() CV_OVERRIDE { peak = curr; } - - // Controller interface - void onAllocate(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - - counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz; - - peak = std::max((counter_t)peak, new_curr); // non-thread safe - - //CV_XADD(&total, (uint64_t)sz); // overflow with int, non-reliable... 
- total += sz; - - CV_XADD(&total_allocs, (counter_t)1); - } - void onFree(size_t sz) - { -#ifdef CV__ALLOCATOR_STATS_LOG - CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr)); -#endif - CV_XADD(&curr, (counter_t)-sz); - } -#endif +#endif // OPENCV_DISABLE_ALLOCATOR_STATS }; #ifdef CV__ALLOCATOR_STATS_LOG diff --git a/3rdParty/opencv2/core/utils/filesystem.hpp b/3rdParty/opencv2/core/utils/filesystem.hpp index 54f691b4c4..93bc33aefd 100644 --- a/3rdParty/opencv2/core/utils/filesystem.hpp +++ b/3rdParty/opencv2/core/utils/filesystem.hpp @@ -62,7 +62,7 @@ CV_EXPORTS void glob_relative(const cv::String& directory, const cv::String& pat CV_EXPORTS bool createDirectory(const cv::String& path); CV_EXPORTS bool createDirectories(const cv::String& path); -#ifdef __OPENCV_BUILD +#if defined(__OPENCV_BUILD) || defined(BUILD_PLUGIN) // TODO //CV_EXPORTS cv::String getTempDirectory(); diff --git a/3rdParty/opencv2/core/utils/trace.hpp b/3rdParty/opencv2/core/utils/trace.hpp index 3f3d372f3e..74d1256c9f 100644 --- a/3rdParty/opencv2/core/utils/trace.hpp +++ b/3rdParty/opencv2/core/utils/trace.hpp @@ -70,11 +70,11 @@ class CV_EXPORTS Region struct LocationExtraData; struct LocationStaticStorage { - LocationExtraData** ppExtra; //< implementation specific data - const char* name; //< region name (function name or other custom name) - const char* filename; //< source code filename - int line; //< source code line - int flags; //< flags (implementation code path: Plain, IPP, OpenCL) + LocationExtraData** ppExtra; ///< implementation specific data + const char* name; ///< region name (function name or other custom name) + const char* filename; ///< source code filename + int line; ///< source code line + int flags; ///< flags (implementation code path: Plain, IPP, OpenCL) }; Region(const LocationStaticStorage& location); @@ -100,18 +100,18 @@ class CV_EXPORTS Region //! 
Specify region flags enum RegionLocationFlag { - REGION_FLAG_FUNCTION = (1 << 0), //< region is function (=1) / nested named region (=0) - REGION_FLAG_APP_CODE = (1 << 1), //< region is Application code (=1) / OpenCV library code (=0) - REGION_FLAG_SKIP_NESTED = (1 << 2), //< avoid processing of nested regions + REGION_FLAG_FUNCTION = (1 << 0), ///< region is function (=1) / nested named region (=0) + REGION_FLAG_APP_CODE = (1 << 1), ///< region is Application code (=1) / OpenCV library code (=0) + REGION_FLAG_SKIP_NESTED = (1 << 2), ///< avoid processing of nested regions - REGION_FLAG_IMPL_IPP = (1 << 16), //< region is part of IPP code path - REGION_FLAG_IMPL_OPENCL = (2 << 16), //< region is part of OpenCL code path - REGION_FLAG_IMPL_OPENVX = (3 << 16), //< region is part of OpenVX code path + REGION_FLAG_IMPL_IPP = (1 << 16), ///< region is part of IPP code path + REGION_FLAG_IMPL_OPENCL = (2 << 16), ///< region is part of OpenCL code path + REGION_FLAG_IMPL_OPENVX = (3 << 16), ///< region is part of OpenVX code path REGION_FLAG_IMPL_MASK = (15 << 16), REGION_FLAG_REGION_FORCE = (1 << 30), - REGION_FLAG_REGION_NEXT = (1 << 31), //< close previous region (see #CV_TRACE_REGION_NEXT macro) + REGION_FLAG_REGION_NEXT = (1 << 31), ///< close previous region (see #CV_TRACE_REGION_NEXT macro) ENUM_REGION_FLAG_FORCE_INT = INT_MAX }; diff --git a/3rdParty/opencv2/core/version.hpp b/3rdParty/opencv2/core/version.hpp index 727be984df..8a9621f68b 100644 --- a/3rdParty/opencv2/core/version.hpp +++ b/3rdParty/opencv2/core/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_VERSION_HPP #define CV_VERSION_MAJOR 4 -#define CV_VERSION_MINOR 6 +#define CV_VERSION_MINOR 11 #define CV_VERSION_REVISION 0 #define CV_VERSION_STATUS "" diff --git a/3rdParty/opencv2/cvconfig.h b/3rdParty/opencv2/cvconfig.h index 6efd5e856f..e1b1ee82bb 100644 --- a/3rdParty/opencv2/cvconfig.h +++ b/3rdParty/opencv2/cvconfig.h @@ -72,11 +72,14 @@ #define HAVE_OPENJPEG /* #undef HAVE_JASPER */ +/* AVIF codec */ +/* #undef HAVE_AVIF */ + /* IJG JPEG codec */ #define HAVE_JPEG -/* libpng/png.h needs to be included */ -/* #undef HAVE_LIBPNG_PNG_H */ +/* JPEG XL codec */ +/* #undef HAVE_JPEGXL */ /* GDCM DICOM codec */ /* #undef HAVE_GDCM */ @@ -106,6 +109,9 @@ /* PNG codec */ #define HAVE_PNG +/* PNG codec */ +/* #undef HAVE_SPNG */ + /* Posix threads (pthreads) */ /* #undef HAVE_PTHREAD */ @@ -144,6 +150,6 @@ #define OPENCV_TRACE /* Library QR-code decoding */ -#define HAVE_QUIRC +/* #undef HAVE_QUIRC */ #endif // OPENCV_CVCONFIG_H_INCLUDED diff --git a/3rdParty/opencv2/dnn/all_layers.hpp b/3rdParty/opencv2/dnn/all_layers.hpp index e3f0c3df97..0475bac230 100644 --- a/3rdParty/opencv2/dnn/all_layers.hpp +++ b/3rdParty/opencv2/dnn/all_layers.hpp @@ -241,6 +241,39 @@ CV__DNN_INLINE_NS_BEGIN }; + /** @brief This function performs array summation based + * on the Einstein summation convention. The function + * allows for concise expressions of various mathematical + * operations using subscripts. + * + * By default, the labels are placed in alphabetical + * order at the end of the output. + * For example: + * if `c = einsum("i,j", a, b)`, then `c[i,j] == a[i]*b[j]`. + * However, if `c = einsum("j,i", a, b)`, then `c[i,j] = a[j]*b[i]`. + * Alternatively, you can control the output order or prevent + * an axis from being summed/force an axis to be summed + * by providing indices for the output. 
+ * For example: + * `diag(a)` -> `einsum("ii->i", a)` + * `sum(a, axis=0)` -> `einsum("i...->", a)` + * Subscripts at the beginning and end may be specified + * by putting an ellipsis "..." in the middle. + * For instance, the function `einsum("i...i", a)` takes + * the diagonal of the first and last dimensions of + * the operand, and `einsum("ij...,jk...->ik...")` performs + * the matrix product using the first two indices + * of each operand instead of the last two. + * When there is only one operand, no axes being summed, + * and no output parameter, this function returns + * a view into the operand instead of creating a copy. + */ + class CV_EXPORTS EinsumLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS BaseConvolutionLayer : public Layer { public: @@ -256,6 +289,9 @@ CV__DNN_INLINE_NS_BEGIN { public: static Ptr create(const LayerParams& params); + bool fusedActivation = false; + bool fusedAdd = false; + bool useWinograd = true; // Flag whether to use Winograd to speed up 3x3 convolution. }; class CV_EXPORTS ConvolutionLayerInt8 : public BaseConvolutionLayer @@ -263,6 +299,11 @@ CV__DNN_INLINE_NS_BEGIN public: int input_zp, output_zp; float input_sc, output_sc; + + // quantization type flag. The perChannel default is true, that means it contains the parameters + // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters. + bool per_channel; + bool useWinograd = false; // Flag whether to use Winograd to speed up 3x3 convolution. static Ptr create(const LayerParams& params); }; @@ -294,6 +335,30 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + /** @brief Gather layer + */ + class CV_EXPORTS GatherLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + + /** @brief GatherElements layer + * GatherElements takes two inputs data and indices of the same rank r >= 1 and an optional attribute axis and works such that: + * output[i][j][k] = data[index[i][j][k]][j][k] if axis = 0 and r = 3 + * output[i][j][k] = data[i][index[i][j][k]][k] if axis = 1 and r = 3 + * output[i][j][k] = data[i][j][index[i][j][k]] if axis = 2 and r = 3 + * + * Gather, on the other hand, takes a data tensor of rank r >= 1, and an indices tensor of rank q, and works such that: + * it gathers the entries along the axis dimension of the input data indexed by indices and concatenates them in an output tensor of rank q + (r - 1) + * e.g. If axis = 0, let k = indices[i_{0}, ..., i_{q-1}] then output[i_{0}, ..., i_{q-1}, j_{0}, ..., j_{r-2}] = input[k, j_{0}, ..., j_{r-2}]: + **/ + class CV_EXPORTS GatherElementsLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + class CV_EXPORTS PoolingLayer : public Layer { public: @@ -329,17 +394,9 @@ CV__DNN_INLINE_NS_BEGIN class CV_EXPORTS ReduceLayer : public Layer { public: - int reduceType; - std::vector reduceDims; static Ptr create(const LayerParams& params); }; - class CV_EXPORTS ReduceLayerInt8 : public ReduceLayer - { - public: - static Ptr create(const LayerParams& params); - }; - class CV_EXPORTS SoftmaxLayer : public Layer { public: @@ -356,6 +413,10 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + /** + * `InnerProduct`, `MatMul` and `Gemm` operations are all implemented by Fully Connected Layer. + * Parameter `is_matmul` is used to distinguish `MatMul` and `Gemm` from `InnerProduct`.
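As a plain-loop illustration of the subscript convention described in the EinsumLayer comment above (this is not the layer implementation), `einsum("ij,jk->ik", A, B)` is an ordinary matrix product:

```cpp
#include <vector>

// C[i][k] = sum_j A[i][j] * B[j][k]  -- the repeated subscript j is summed over.
std::vector<std::vector<float>> einsum_ij_jk_ik(const std::vector<std::vector<float>>& A,
                                                const std::vector<std::vector<float>>& B)
{
    const size_t I = A.size(), J = B.size(), K = B.empty() ? 0 : B[0].size();
    std::vector<std::vector<float>> C(I, std::vector<float>(K, 0.f));
    for (size_t i = 0; i < I; ++i)
        for (size_t k = 0; k < K; ++k)
            for (size_t j = 0; j < J; ++j)
                C[i][k] += A[i][j] * B[j][k];
    return C;
}
```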
+ */ class CV_EXPORTS InnerProductLayer : public Layer { public: @@ -368,6 +429,10 @@ CV__DNN_INLINE_NS_BEGIN public: int input_zp, output_zp; float input_sc, output_sc; + + // quantization type flag. The perChannel default is true, that means it contains the parameters + // of per-Channel quantization. Otherwise, that means this layer contains per-Tensor quantized parameters. + bool per_channel; static Ptr create(const LayerParams& params); }; @@ -400,16 +465,16 @@ CV__DNN_INLINE_NS_BEGIN class CV_EXPORTS QuantizeLayer : public Layer { public: - float scale; - int zeropoint; + std::vector scales; + std::vector zeropoints; static Ptr create(const LayerParams ¶ms); }; class CV_EXPORTS DequantizeLayer : public Layer { public: - float scale; - int zeropoint; + std::vector scales; + std::vector zeropoints; static Ptr create(const LayerParams ¶ms); }; @@ -539,11 +604,11 @@ CV__DNN_INLINE_NS_BEGIN { public: virtual void forwardSlice(const float* src, float* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} virtual void forwardSlice(const int* src, const int* lut, int* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} virtual void forwardSlice(const int8_t* src, const int8_t* lut, int8_t* dst, int len, - size_t outPlaneSize, int cn0, int cn1) const {}; + size_t outPlaneSize, int cn0, int cn1) const {} }; class CV_EXPORTS ReLULayer : public ActivationLayer @@ -780,6 +845,18 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS GeluLayer : public ActivationLayer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS GeluApproximationLayer : public ActivationLayer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS ThresholdedReluLayer : public ActivationLayer { public: @@ -841,6 +918,12 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams ¶ms); }; + class CV_EXPORTS NaryEltwiseLayer : public Layer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + class CV_EXPORTS BatchNormLayer : public ActivationLayer { public: @@ -1039,6 +1122,88 @@ CV__DNN_INLINE_NS_BEGIN static Ptr create(const LayerParams& params); }; + class CV_EXPORTS ScatterLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + + class CV_EXPORTS ScatterNDLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + + class CV_EXPORTS TileLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + + class CV_EXPORTS LayerNormLayer : public Layer + { + public: + CV_DEPRECATED_EXTERNAL bool hasBias; // Deprecated, preserve for compatibility + int axis; + float epsilon; + + static Ptr create(const LayerParams& params); + }; + + class CV_EXPORTS GemmLayer : public Layer { + public: + bool trans_a; + bool trans_b; + float alpha; + float beta; + + static Ptr create(const LayerParams& params); + }; + + class CV_EXPORTS MatMulLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS ExpandLayer : public Layer + { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS InstanceNormLayer : public Layer { + public: + float epsilon; + + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS AttentionLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS GroupNormLayer : public Layer { + 
public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS DepthToSpaceLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS SpaceToDepthLayer : public Layer { + public: + static Ptr create(const LayerParams ¶ms); + }; + + class CV_EXPORTS TopKLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); + }; + //! @} //! @} CV__DNN_INLINE_NS_END diff --git a/3rdParty/opencv2/dnn/dnn.hpp b/3rdParty/opencv2/dnn/dnn.hpp index 7132f64e9c..105e973193 100644 --- a/3rdParty/opencv2/dnn/dnn.hpp +++ b/3rdParty/opencv2/dnn/dnn.hpp @@ -52,6 +52,11 @@ namespace cv { namespace dnn { + +namespace accessor { +class DnnNetAccessor; // forward declaration +} + CV__DNN_INLINE_NS_BEGIN //! @addtogroup dnn //! @{ @@ -64,21 +69,22 @@ CV__DNN_INLINE_NS_BEGIN */ enum Backend { - //! DNN_BACKEND_DEFAULT equals to DNN_BACKEND_INFERENCE_ENGINE if - //! OpenCV is built with Intel's Inference Engine library or - //! DNN_BACKEND_OPENCV otherwise. + //! DNN_BACKEND_DEFAULT equals to OPENCV_DNN_BACKEND_DEFAULT, which can be defined using CMake or a configuration parameter DNN_BACKEND_DEFAULT = 0, DNN_BACKEND_HALIDE, - DNN_BACKEND_INFERENCE_ENGINE, //!< Intel's Inference Engine computational backend - //!< @sa setInferenceEngineBackendType + DNN_BACKEND_INFERENCE_ENGINE, //!< Intel OpenVINO computational backend + //!< @note Tutorial how to build OpenCV with OpenVINO: @ref tutorial_dnn_openvino DNN_BACKEND_OPENCV, DNN_BACKEND_VKCOM, DNN_BACKEND_CUDA, DNN_BACKEND_WEBNN, DNN_BACKEND_TIMVX, -#ifdef __OPENCV_BUILD + DNN_BACKEND_CANN, +#if defined(__OPENCV_BUILD) || defined(BUILD_PLUGIN) +#if !defined(OPENCV_BINDING_PARSER) DNN_BACKEND_INFERENCE_ENGINE_NGRAPH = 1000000, // internal - use DNN_BACKEND_INFERENCE_ENGINE + setInferenceEngineBackendType() DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, // internal - use DNN_BACKEND_INFERENCE_ENGINE + setInferenceEngineBackendType() +#endif #endif }; @@ -98,6 +104,22 @@ CV__DNN_INLINE_NS_BEGIN DNN_TARGET_CUDA_FP16, DNN_TARGET_HDDL, DNN_TARGET_NPU, + DNN_TARGET_CPU_FP16, // Only the ARM platform is supported. Low precision computing, accelerate model inference. + }; + + /** + * @brief Enum of data layout for model inference. + * @see Image2BlobParams + */ + enum DataLayout + { + DNN_LAYOUT_UNKNOWN = 0, + DNN_LAYOUT_ND = 1, //!< OpenCV data layout for 2D data. + DNN_LAYOUT_NCHW = 2, //!< OpenCV data layout for 4D data. + DNN_LAYOUT_NCDHW = 3, //!< OpenCV data layout for 5D data. + DNN_LAYOUT_NHWC = 4, //!< Tensorflow-like data layout for 4D data. + DNN_LAYOUT_NDHWC = 5, //!< Tensorflow-like data layout for 5D data. + DNN_LAYOUT_PLANAR = 6, //!< Tensorflow-like data layout, it should only be used at tf or tflite model parsing. }; CV_EXPORTS std::vector< std::pair > getAvailableBackends(); @@ -192,7 +214,7 @@ CV__DNN_INLINE_NS_BEGIN /** @brief This interface class allows to build new Layers - are building blocks of networks. * - * Each class, derived from Layer, must implement allocate() methods to declare own outputs and forward() to compute outputs. + * Each class, derived from Layer, must implement forward() method to compute outputs. * Also before using the new layer into networks you must register your layer by using one of @ref dnnLayerFactory "LayerFactory" macros. 
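A rough sketch of the registration path mentioned above, assuming the usual LayerFactory mechanism; the layer name and clamp logic are invented for illustration, and getMemoryShapes()/finalize() overrides are omitted:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <vector>

class MyReLU6Layer : public cv::dnn::Layer
{
public:
    MyReLU6Layer(const cv::dnn::LayerParams& params) : Layer(params) {}

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::makePtr<MyReLU6Layer>(params);
    }

    void forward(cv::InputArrayOfArrays inputs_arr, cv::OutputArrayOfArrays outputs_arr,
                 cv::OutputArrayOfArrays /*internals*/) CV_OVERRIDE
    {
        std::vector<cv::Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);
        cv::Mat clipped = cv::max(inputs[0], 0.0);   // clamp below at 0
        cv::min(clipped, 6.0, outputs[0]);           // clamp above at 6
    }
};

// Typically registered once at startup:
// cv::dnn::LayerFactory::registerLayer("MyReLU6", MyReLU6Layer::create);
```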
*/ class CV_EXPORTS_W Layer : public Algorithm @@ -207,7 +229,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] input vector of already allocated input blobs * @param[out] output vector of already allocated output blobs * - * If this method is called after network has allocated all memory for input and output blobs + * This method is called after network has allocated all memory for input and output blobs * and before inferencing. */ CV_DEPRECATED_EXTERNAL @@ -217,7 +239,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] inputs vector of already allocated input blobs * @param[out] outputs vector of already allocated output blobs * - * If this method is called after network has allocated all memory for input and output blobs + * This method is called after network has allocated all memory for input and output blobs * and before inferencing. */ CV_WRAP virtual void finalize(InputArrayOfArrays inputs, OutputArrayOfArrays outputs); @@ -306,7 +328,7 @@ CV__DNN_INLINE_NS_BEGIN virtual Ptr initNgraph(const std::vector > &inputs, const std::vector >& nodes); - virtual Ptr initVkCom(const std::vector > &inputs); + virtual Ptr initVkCom(const std::vector > &inputs, std::vector > &outputs); virtual Ptr initWebnn(const std::vector > &inputs, const std::vector >& nodes); @@ -336,6 +358,17 @@ CV__DNN_INLINE_NS_BEGIN const std::vector > &outputsWrapper, bool isLast); + /** + * @brief Returns a CANN backend node + * + * @param inputs input tensors of CANN operator + * @param outputs output tensors of CANN operator + * @param nodes nodes of input tensors + */ + virtual Ptr initCann(const std::vector > &inputs, + const std::vector > &outputs, + const std::vector >& nodes); + /** * @brief Automatic Halide scheduling based on layer hyper-parameters. * @param[in] node Backend node with Halide functions. @@ -451,7 +484,7 @@ CV__DNN_INLINE_NS_BEGIN * Networks imported from Intel's Model Optimizer are launched in Intel's Inference Engine * backend. */ - CV_WRAP static Net readFromModelOptimizer(const String& xml, const String& bin); + CV_WRAP static Net readFromModelOptimizer(CV_WRAP_FILE_PATH const String& xml, CV_WRAP_FILE_PATH const String& bin); /** @brief Create a network from Intel's Model Optimizer in-memory buffers with intermediate representation (IR). * @param[in] bufferModelConfig buffer with model's configuration. @@ -484,7 +517,15 @@ CV__DNN_INLINE_NS_BEGIN * @param path path to output file with .dot extension * @see dump() */ - CV_WRAP void dumpToFile(const String& path); + CV_WRAP void dumpToFile(CV_WRAP_FILE_PATH const String& path); + /** @brief Dump net structure, hyperparameters, backend, target and fusion to pbtxt file + * @param path path to output file with .pbtxt extension + * + * Use Netron (https://netron.app) to open the target file to visualize the model. + * Call method after setInput(). To see correct backend, target and fusion run after forward(). + */ + CV_WRAP void dumpToPbtxt(CV_WRAP_FILE_PATH const String& path); + /** @brief Adds new layer to the net. * @param name unique name of the adding layer. * @param type typename of the adding layer (type must be registered in LayerRegister). @@ -492,7 +533,7 @@ CV__DNN_INLINE_NS_BEGIN * @param params parameters which will be used to initialize the creating layer. * @returns unique identifier of created layer, or -1 if a failure will happen. 
*/ - int addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + CV_WRAP int addLayer(const String &name, const String &type, const int &dtype, LayerParams ¶ms); /** @overload Datatype of output blobs set to default CV_32F */ int addLayer(const String &name, const String &type, LayerParams ¶ms); @@ -500,7 +541,7 @@ CV__DNN_INLINE_NS_BEGIN /** @brief Adds new layer and connects its first input to the first output of previously added layer. * @see addLayer() */ - int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms); + CV_WRAP int addLayerToPrev(const String &name, const String &type, const int &dtype, LayerParams ¶ms); /** @overload */ int addLayerToPrev(const String &name, const String &type, LayerParams ¶ms); @@ -601,13 +642,13 @@ CV__DNN_INLINE_NS_BEGIN * @param outputName name for layer which output is needed to get * @details If @p outputName is empty, runs forward pass for the whole network. */ - CV_WRAP void forward(OutputArrayOfArrays outputBlobs, const String& outputName = String()); + CV_WRAP void forward(CV_ND OutputArrayOfArrays outputBlobs, const String& outputName = String()); /** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames. * @param outputBlobs contains blobs for first outputs of specified layers. * @param outBlobNames names for layers which outputs are needed to get */ - CV_WRAP void forward(OutputArrayOfArrays outputBlobs, + CV_WRAP void forward(CV_ND OutputArrayOfArrays outputBlobs, const std::vector& outBlobNames); /** @brief Runs forward pass to compute outputs of layers listed in @p outBlobNames. @@ -621,8 +662,10 @@ CV__DNN_INLINE_NS_BEGIN * @param calibData Calibration data to compute the quantization parameters. * @param inputsDtype Datatype of quantized net's inputs. Can be CV_32F or CV_8S. * @param outputsDtype Datatype of quantized net's outputs. Can be CV_32F or CV_8S. + * @param perChannel Quantization granularity of quantized Net. The default is true, that means quantize model + * in per-channel way (channel-wise). Set it false to quantize model in per-tensor way (or tensor-wise). */ - CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype); + CV_WRAP Net quantize(InputArrayOfArrays calibData, int inputsDtype, int outputsDtype, bool perChannel=true); /** @brief Returns input scale and zeropoint for a quantized Net. * @param scales output parameter for returning input scales. @@ -651,9 +694,6 @@ CV__DNN_INLINE_NS_BEGIN * @brief Ask network to use specific computation backend where it supported. * @param[in] backendId backend identifier. * @see Backend - * - * If OpenCV is compiled with Intel's Inference Engine library, DNN_BACKEND_DEFAULT - * means DNN_BACKEND_INFERENCE_ENGINE. Otherwise it equals to DNN_BACKEND_OPENCV. */ CV_WRAP void setPreferableBackend(int backendId); @@ -687,7 +727,7 @@ CV__DNN_INLINE_NS_BEGIN * as: * \f[input(n,c,h,w) = scalefactor \times (blob(n,c,h,w) - mean_c)\f] */ - CV_WRAP void setInput(InputArray blob, const String& name = "", + CV_WRAP void setInput(CV_ND InputArray blob, const String& name = "", double scalefactor = 1.0, const Scalar& mean = Scalar()); /** @brief Sets the new value for the learned param of the layer. @@ -698,8 +738,8 @@ CV__DNN_INLINE_NS_BEGIN * @note If shape of the new blob differs from the previous shape, * then the following forward pass may fail. 
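A hedged usage sketch for the perChannel switch documented above; the model file name and calibration blobs are placeholders:

```cpp
#include <opencv2/dnn.hpp>
#include <vector>

cv::dnn::Net quantizeBothWays(const std::vector<cv::Mat>& calibBlobs)
{
    cv::dnn::Net net = cv::dnn::readNetFromONNX("model.onnx");      // placeholder path
    // Channel-wise quantization (the default):
    cv::dnn::Net qPerChannel = net.quantize(calibBlobs, CV_32F, CV_8S, /*perChannel=*/true);
    // Tensor-wise quantization:
    cv::dnn::Net qPerTensor  = net.quantize(calibBlobs, CV_32F, CV_8S, /*perChannel=*/false);
    (void)qPerTensor;
    return qPerChannel;
}
```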
*/ - CV_WRAP void setParam(int layer, int numParam, const Mat &blob); - CV_WRAP inline void setParam(const String& layerName, int numParam, const Mat &blob) { return setParam(getLayerId(layerName), numParam, blob); } + CV_WRAP void setParam(int layer, int numParam, CV_ND const Mat &blob); + CV_WRAP inline void setParam(const String& layerName, int numParam, CV_ND const Mat &blob) { return setParam(getLayerId(layerName), numParam, blob); } /** @brief Returns parameter blob of the layer. * @param layer name or id of the layer. @@ -828,6 +868,12 @@ CV__DNN_INLINE_NS_BEGIN */ CV_WRAP void enableFusion(bool fusion); + /** @brief Enables or disables the Winograd compute branch. The Winograd compute branch can speed up + * 3x3 Convolution at a small loss of accuracy. + * @param useWinograd true to enable the Winograd compute branch. The default is true. + */ + CV_WRAP void enableWinograd(bool useWinograd); + /** @brief Returns overall time for inference and timings (in ticks) for layers. * * Indexes in returned vector correspond to layers ids. Some layers can be fused with others, @@ -838,8 +884,12 @@ CV__DNN_INLINE_NS_BEGIN */ CV_WRAP int64 getPerfProfile(CV_OUT std::vector& timings); - private: + struct Impl; + inline Impl* getImpl() const { return impl.get(); } + inline Impl& getImplRef() const { CV_DbgAssert(impl); return *impl.get(); } + friend class accessor::DnnNetAccessor; + protected: Ptr impl; }; @@ -847,9 +897,8 @@ CV__DNN_INLINE_NS_BEGIN * @param cfgFile path to the .cfg file with text description of the network architecture. * @param darknetModel path to the .weights file with learned network. * @returns Network object that ready to do forward, throw an exception in failure cases. - * @returns Net object. */ - CV_EXPORTS_W Net readNetFromDarknet(const String &cfgFile, const String &darknetModel = String()); + CV_EXPORTS_W Net readNetFromDarknet(CV_WRAP_FILE_PATH const String &cfgFile, CV_WRAP_FILE_PATH const String &darknetModel = String()); /** @brief Reads a network model stored in Darknet model files. * @param bufferCfg A buffer contains a content of .cfg file with text description of the network architecture. @@ -874,7 +923,7 @@ CV__DNN_INLINE_NS_BEGIN * @param caffeModel path to the .caffemodel file with learned network. * @returns Net object. */ - CV_EXPORTS_W Net readNetFromCaffe(const String &prototxt, const String &caffeModel = String()); + CV_EXPORTS_W Net readNetFromCaffe(CV_WRAP_FILE_PATH const String &prototxt, CV_WRAP_FILE_PATH const String &caffeModel = String()); /** @brief Reads a network model stored in Caffe model in memory. * @param bufferProto buffer containing the content of the .prototxt file @@ -903,7 +952,7 @@ CV__DNN_INLINE_NS_BEGIN * let us make it more flexible. * @returns Net object. */ - CV_EXPORTS_W Net readNetFromTensorflow(const String &model, const String &config = String()); + CV_EXPORTS_W Net readNetFromTensorflow(CV_WRAP_FILE_PATH const String &model, CV_WRAP_FILE_PATH const String &config = String()); /** @brief Reads a network model stored in TensorFlow framework's format. * @param bufferModel buffer containing the content of the pb file @@ -924,6 +973,26 @@ CV__DNN_INLINE_NS_BEGIN CV_EXPORTS Net readNetFromTensorflow(const char *bufferModel, size_t lenModel, const char *bufferConfig = NULL, size_t lenConfig = 0); + /** @brief Reads a network model stored in TFLite framework's format. + * @param model path to the .tflite file with binary flatbuffers description of the network architecture + * @returns Net object. 
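An illustrative sketch of the Winograd toggle and per-layer profiling declared above; it assumes a Net and input blob that have already been prepared:

```cpp
#include <opencv2/dnn.hpp>
#include <vector>

double runAndProfile(cv::dnn::Net& net, const cv::Mat& blob)
{
    net.enableWinograd(false);       // prefer exact 3x3 convolution over the faster Winograd path
    net.setInput(blob);
    cv::Mat out = net.forward();
    std::vector<double> layerTimesTicks;
    const double totalTicks = static_cast<double>(net.getPerfProfile(layerTimesTicks));
    (void)out;
    return totalTicks * 1000.0 / cv::getTickFrequency();   // total inference time in milliseconds
}
```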
+ */ + CV_EXPORTS_W Net readNetFromTFLite(CV_WRAP_FILE_PATH const String &model); + + /** @brief Reads a network model stored in TFLite framework's format. + * @param bufferModel buffer containing the content of the tflite file + * @returns Net object. + */ + CV_EXPORTS_W Net readNetFromTFLite(const std::vector& bufferModel); + + /** @brief Reads a network model stored in TFLite framework's format. + * @details This is an overloaded member function, provided for convenience. + * It differs from the above function only in what argument(s) it accepts. + * @param bufferModel buffer containing the content of the tflite file + * @param lenModel length of bufferModel + */ + CV_EXPORTS Net readNetFromTFLite(const char *bufferModel, size_t lenModel); + /** * @brief Reads a network model stored in Torch7 framework's format. * @param model path to the file, dumped from Torch by using torch.save() function. @@ -950,7 +1019,7 @@ CV__DNN_INLINE_NS_BEGIN * * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported. */ - CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true); + CV_EXPORTS_W Net readNetFromTorch(CV_WRAP_FILE_PATH const String &model, bool isBinary = true, bool evaluate = true); /** * @brief Read deep learning network represented in one of the supported formats. @@ -960,14 +1029,14 @@ CV__DNN_INLINE_NS_BEGIN * * `*.pb` (TensorFlow, https://www.tensorflow.org/) * * `*.t7` | `*.net` (Torch, http://torch.ch/) * * `*.weights` (Darknet, https://pjreddie.com/darknet/) - * * `*.bin` (DLDT, https://software.intel.com/openvino-toolkit) + * * `*.bin` | `*.onnx` (OpenVINO, https://software.intel.com/openvino-toolkit) * * `*.onnx` (ONNX, https://onnx.ai/) * @param[in] config Text file contains network configuration. It could be a * file with the following extensions: * * `*.prototxt` (Caffe, http://caffe.berkeleyvision.org/) * * `*.pbtxt` (TensorFlow, https://www.tensorflow.org/) * * `*.cfg` (Darknet, https://pjreddie.com/darknet/) - * * `*.xml` (DLDT, https://software.intel.com/openvino-toolkit) + * * `*.xml` (OpenVINO, https://software.intel.com/openvino-toolkit) * @param[in] framework Explicit framework name tag to determine a format. * @returns Net object. * @@ -976,7 +1045,7 @@ CV__DNN_INLINE_NS_BEGIN * @ref readNetFromTorch or @ref readNetFromDarknet. An order of @p model and @p config * arguments does not matter. */ - CV_EXPORTS_W Net readNet(const String& model, const String& config = "", const String& framework = ""); + CV_EXPORTS_W Net readNet(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = "", const String& framework = ""); /** * @brief Read deep learning network represented in one of the supported formats. @@ -1003,7 +1072,7 @@ CV__DNN_INLINE_NS_BEGIN * backend. */ CV_EXPORTS_W - Net readNetFromModelOptimizer(const String &xml, const String &bin); + Net readNetFromModelOptimizer(CV_WRAP_FILE_PATH const String &xml, CV_WRAP_FILE_PATH const String &bin = ""); /** @brief Load a network from Intel's Model Optimizer intermediate representation. * @param[in] bufferModelConfig Buffer contains XML configuration with network's topology. @@ -1032,7 +1101,7 @@ CV__DNN_INLINE_NS_BEGIN * @param onnxFile path to the .onnx file with text description of the network architecture. * @returns Network object that ready to do forward, throw an exception in failure cases. 
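A minimal load-and-run sketch for the readers above; the file names are placeholders:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");   // format deduced from the extension
    cv::Mat img = cv::imread("input.jpg");
    cv::Mat blob = cv::dnn::blobFromImage(img, 1.0 / 255.0, cv::Size(224, 224),
                                          cv::Scalar(), /*swapRB=*/true, /*crop=*/false);
    net.setInput(blob);
    cv::Mat prob = net.forward();                         // output of the last layer
    (void)prob;
    return 0;
}
```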
*/ - CV_EXPORTS_W Net readNetFromONNX(const String &onnxFile); + CV_EXPORTS_W Net readNetFromONNX(CV_WRAP_FILE_PATH const String &onnxFile); /** @brief Reads a network model from ONNX * in-memory buffer. @@ -1055,15 +1124,15 @@ CV__DNN_INLINE_NS_BEGIN * @param path to the .pb file with input tensor. * @returns Mat. */ - CV_EXPORTS_W Mat readTensorFromONNX(const String& path); + CV_EXPORTS_W Mat readTensorFromONNX(CV_WRAP_FILE_PATH const String& path); /** @brief Creates 4-dimensional blob from image. Optionally resizes and crops @p image from center, * subtract @p mean values, scales values by @p scalefactor, swap Blue and Red channels. * @param image input image (with 1-, 3- or 4-channels). + * @param scalefactor multiplier for @p images values. * @param size spatial size for output image * @param mean scalar with mean values which are subtracted from channels. Values are intended * to be in (mean-R, mean-G, mean-B) order if @p image has BGR ordering and @p swapRB is true. - * @param scalefactor multiplier for @p image values. * @param swapRB flag which indicates that swap first and last channels * in 3-channel image is necessary. * @param crop flag which indicates whether image will be cropped after resize or not @@ -1072,6 +1141,9 @@ CV__DNN_INLINE_NS_BEGIN * dimension in @p size and another one is equal or larger. Then, crop from the center is performed. * If @p crop is false, direct resize without cropping and preserving aspect ratio is performed. * @returns 4-dimensional Mat with NCHW dimensions order. + * + * @note + * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor. */ CV_EXPORTS_W Mat blobFromImage(InputArray image, double scalefactor=1.0, const Size& size = Size(), const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false, @@ -1102,6 +1174,9 @@ CV__DNN_INLINE_NS_BEGIN * dimension in @p size and another one is equal or larger. Then, crop from the center is performed. * If @p crop is false, direct resize without cropping and preserving aspect ratio is performed. * @returns 4-dimensional Mat with NCHW dimensions order. + * + * @note + * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor. */ CV_EXPORTS_W Mat blobFromImages(InputArrayOfArrays images, double scalefactor=1.0, Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false, @@ -1116,6 +1191,89 @@ CV__DNN_INLINE_NS_BEGIN const Scalar& mean = Scalar(), bool swapRB=false, bool crop=false, int ddepth=CV_32F); + /** + * @brief Enum of image processing mode. + * To facilitate the specialization pre-processing requirements of the dnn model. + * For example, the `letter box` often used in the Yolo series of models. + * @see Image2BlobParams + */ + enum ImagePaddingMode + { + DNN_PMODE_NULL = 0, // !< Default. Resize to required input size without extra processing. + DNN_PMODE_CROP_CENTER = 1, // !< Image will be cropped after resize. + DNN_PMODE_LETTERBOX = 2, // !< Resize image to the desired size while preserving the aspect ratio of original image. + }; + + /** @brief Processing params of image to blob. + * + * It includes all possible image processing operations and corresponding parameters. + * + * @see blobFromImageWithParams + * + * @note + * The order and usage of `scalefactor` and `mean` are (input - mean) * scalefactor. + * The order and usage of `scalefactor`, `size`, `mean`, `swapRB`, and `ddepth` are consistent + * with the function of @ref blobFromImage. 
+ */ + struct CV_EXPORTS_W_SIMPLE Image2BlobParams + { + CV_WRAP Image2BlobParams(); + CV_WRAP Image2BlobParams(const Scalar& scalefactor, const Size& size = Size(), const Scalar& mean = Scalar(), + bool swapRB = false, int ddepth = CV_32F, DataLayout datalayout = DNN_LAYOUT_NCHW, + ImagePaddingMode mode = DNN_PMODE_NULL, Scalar borderValue = 0.0); + + CV_PROP_RW Scalar scalefactor; //!< scalefactor multiplier for input image values. + CV_PROP_RW Size size; //!< Spatial size for output image. + CV_PROP_RW Scalar mean; //!< Scalar with mean values which are subtracted from channels. + CV_PROP_RW bool swapRB; //!< Flag which indicates that swap first and last channels + CV_PROP_RW int ddepth; //!< Depth of output blob. Choose CV_32F or CV_8U. + CV_PROP_RW DataLayout datalayout; //!< Order of output dimensions. Choose DNN_LAYOUT_NCHW or DNN_LAYOUT_NHWC. + CV_PROP_RW ImagePaddingMode paddingmode; //!< Image padding mode. @see ImagePaddingMode. + CV_PROP_RW Scalar borderValue; //!< Value used in padding mode for padding. + + /** @brief Get rectangle coordinates in original image system from rectangle in blob coordinates. + * @param rBlob rect in blob coordinates. + * @param size original input image size. + * @returns rectangle in original image coordinates. + */ + CV_WRAP Rect blobRectToImageRect(const Rect &rBlob, const Size &size); + + /** @brief Get rectangle coordinates in original image system from rectangle in blob coordinates. + * @param rBlob rect in blob coordinates. + * @param rImg result rect in image coordinates. + * @param size original input image size. + */ + CV_WRAP void blobRectsToImageRects(const std::vector &rBlob, CV_OUT std::vector& rImg, const Size& size); + }; + + /** @brief Creates 4-dimensional blob from image with given params. + * + * @details This function is an extension of @ref blobFromImage to meet more image preprocess needs. + * Given input image and preprocessing parameters, and function outputs the blob. + * + * @param image input image (all with 1-, 3- or 4-channels). + * @param param struct of Image2BlobParams, contains all parameters needed by processing of image to blob. + * @return 4-dimensional Mat. + */ + CV_EXPORTS_W Mat blobFromImageWithParams(InputArray image, const Image2BlobParams& param = Image2BlobParams()); + + /** @overload */ + CV_EXPORTS_W void blobFromImageWithParams(InputArray image, OutputArray blob, const Image2BlobParams& param = Image2BlobParams()); + + /** @brief Creates 4-dimensional blob from series of images with given params. + * + * @details This function is an extension of @ref blobFromImages to meet more image preprocess needs. + * Given input image and preprocessing parameters, and function outputs the blob. + * + * @param images input image (all with 1-, 3- or 4-channels). + * @param param struct of Image2BlobParams, contains all parameters needed by processing of image to blob. + * @returns 4-dimensional Mat. + */ + CV_EXPORTS_W Mat blobFromImagesWithParams(InputArrayOfArrays images, const Image2BlobParams& param = Image2BlobParams()); + + /** @overload */ + CV_EXPORTS_W void blobFromImagesWithParams(InputArrayOfArrays images, OutputArray blob, const Image2BlobParams& param = Image2BlobParams()); + /** @brief Parse a 4D blob and output the images it contains as 2D arrays through a simpler data structure * (std::vector). 
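A sketch of the struct declared above used for letterbox-style preprocessing; the target size and scale are illustrative only:

```cpp
#include <opencv2/dnn.hpp>

cv::Mat makeLetterboxBlob(const cv::Mat& bgr)
{
    cv::dnn::Image2BlobParams p;
    p.scalefactor = cv::Scalar::all(1.0 / 255.0);      // applied after mean subtraction
    p.size        = cv::Size(640, 640);
    p.mean        = cv::Scalar(0, 0, 0);
    p.swapRB      = true;
    p.ddepth      = CV_32F;
    p.datalayout  = cv::dnn::DNN_LAYOUT_NCHW;
    p.paddingmode = cv::dnn::DNN_PMODE_LETTERBOX;      // keep aspect ratio, pad with borderValue
    return cv::dnn::blobFromImageWithParams(bgr, p);
}
```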
* @param[in] blob_ 4 dimensional array (images, channels, height, width) in floating point precision (CV_32F) from @@ -1139,7 +1297,7 @@ CV__DNN_INLINE_NS_BEGIN * is taken from NVidia's Caffe fork: https://github.com/NVIDIA/caffe. * So the resulting model may be used there. */ - CV_EXPORTS_W void shrinkCaffeModel(const String& src, const String& dst, + CV_EXPORTS_W void shrinkCaffeModel(CV_WRAP_FILE_PATH const String& src, CV_WRAP_FILE_PATH const String& dst, const std::vector& layersTypes = std::vector()); /** @brief Create a text representation for a binary network stored in protocol buffer format. @@ -1148,7 +1306,7 @@ CV__DNN_INLINE_NS_BEGIN * * @note To reduce output file size, trained weights are not included. */ - CV_EXPORTS_W void writeTextGraph(const String& model, const String& output); + CV_EXPORTS_W void writeTextGraph(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& output); /** @brief Performs non maximum suppression given boxes and corresponding scores. @@ -1175,6 +1333,27 @@ CV__DNN_INLINE_NS_BEGIN CV_OUT std::vector& indices, const float eta = 1.f, const int top_k = 0); + /** @brief Performs batched non maximum suppression on given boxes and corresponding scores across different classes. + + * @param bboxes a set of bounding boxes to apply NMS. + * @param scores a set of corresponding confidences. + * @param class_ids a set of corresponding class ids. Ids are integer and usually start from 0. + * @param score_threshold a threshold used to filter boxes by score. + * @param nms_threshold a threshold used in non maximum suppression. + * @param indices the kept indices of bboxes after NMS. + * @param eta a coefficient in adaptive threshold formula: \f$nms\_threshold_{i+1}=eta\cdot nms\_threshold_i\f$. + * @param top_k if `>0`, keep at most @p top_k picked indices. + */ + CV_EXPORTS void NMSBoxesBatched(const std::vector& bboxes, const std::vector& scores, const std::vector& class_ids, + const float score_threshold, const float nms_threshold, + CV_OUT std::vector& indices, + const float eta = 1.f, const int top_k = 0); + + CV_EXPORTS_W void NMSBoxesBatched(const std::vector& bboxes, const std::vector& scores, const std::vector& class_ids, + const float score_threshold, const float nms_threshold, + CV_OUT std::vector& indices, + const float eta = 1.f, const int top_k = 0); + /** * @brief Enum of Soft NMS methods. * @see softNMSBoxes @@ -1232,7 +1411,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP Model(const String& model, const String& config = ""); + CV_WRAP Model(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1261,7 +1440,7 @@ CV__DNN_INLINE_NS_BEGIN /** @brief Set scalefactor value for frame. * @param[in] scale Multiplier for frame values. */ - CV_WRAP Model& setInputScale(double scale); + CV_WRAP Model& setInputScale(const Scalar& scale); /** @brief Set flag crop for frame. * @param[in] crop Flag which indicates whether image will be cropped after resize or not. @@ -1273,6 +1452,11 @@ CV__DNN_INLINE_NS_BEGIN */ CV_WRAP Model& setInputSwapRB(bool swapRB); + /** @brief Set output names for frame. + * @param[in] outNames Names for output layers. + */ + CV_WRAP Model& setOutputNames(const std::vector& outNames); + /** @brief Set preprocessing parameters for frame. * @param[in] size New input size. 
* @param[in] mean Scalar with mean values which are subtracted from channels. @@ -1302,6 +1486,9 @@ CV__DNN_INLINE_NS_BEGIN /// @sa Net::setPreferableTarget CV_WRAP Model& setPreferableTarget(dnn::Target targetId); + /// @sa Net::enableWinograd + CV_WRAP Model& enableWinograd(bool useWinograd); + CV_DEPRECATED_EXTERNAL operator Net&() const { return getNetwork_(); } @@ -1334,7 +1521,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP ClassificationModel(const String& model, const String& config = ""); + CV_WRAP ClassificationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1384,7 +1571,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP KeypointsModel(const String& model, const String& config = ""); + CV_WRAP KeypointsModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1416,7 +1603,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP SegmentationModel(const String& model, const String& config = ""); + CV_WRAP SegmentationModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1447,7 +1634,7 @@ CV__DNN_INLINE_NS_BEGIN * @param[in] model Binary file contains trained weights. * @param[in] config Text file contains network configuration. */ - CV_WRAP DetectionModel(const String& model, const String& config = ""); + CV_WRAP DetectionModel(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config = ""); /** * @brief Create model from deep learning network. @@ -1513,7 +1700,7 @@ class CV_EXPORTS_W_SIMPLE TextRecognitionModel : public Model * @param[in] config Text file contains network configuration */ CV_WRAP inline - TextRecognitionModel(const std::string& model, const std::string& config = "") + TextRecognitionModel(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextRecognitionModel(readNet(model, config)) { /* nothing */ } /** @@ -1668,7 +1855,7 @@ class CV_EXPORTS_W_SIMPLE TextDetectionModel_EAST : public TextDetectionModel * @param[in] config Text file contains network configuration. */ CV_WRAP inline - TextDetectionModel_EAST(const std::string& model, const std::string& config = "") + TextDetectionModel_EAST(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_EAST(readNet(model, config)) { /* nothing */ } /** @@ -1729,7 +1916,7 @@ class CV_EXPORTS_W_SIMPLE TextDetectionModel_DB : public TextDetectionModel * @param[in] config Text file contains network configuration. 
*/ CV_WRAP inline - TextDetectionModel_DB(const std::string& model, const std::string& config = "") + TextDetectionModel_DB(CV_WRAP_FILE_PATH const std::string& model, CV_WRAP_FILE_PATH const std::string& config = "") : TextDetectionModel_DB(readNet(model, config)) { /* nothing */ } CV_WRAP TextDetectionModel_DB& setBinaryThreshold(float binaryThreshold); diff --git a/3rdParty/opencv2/dnn/shape_utils.hpp b/3rdParty/opencv2/dnn/shape_utils.hpp index a436bed307..5391017440 100644 --- a/3rdParty/opencv2/dnn/shape_utils.hpp +++ b/3rdParty/opencv2/dnn/shape_utils.hpp @@ -43,7 +43,7 @@ #define OPENCV_DNN_DNN_SHAPE_UTILS_HPP #include -#include // CV_MAX_DIM +#include // CV_MAX_DIM #include #include #include @@ -160,22 +160,49 @@ static inline MatShape shape(int a0, int a1=-1, int a2=-1, int a3=-1) static inline int total(const MatShape& shape, int start = -1, int end = -1) { - if (start == -1) start = 0; - if (end == -1) end = (int)shape.size(); - if (shape.empty()) return 0; + int dims = (int)shape.size(); + + if (start == -1) start = 0; + if (end == -1) end = dims; + + CV_CheckLE(0, start, ""); + CV_CheckLE(start, end, ""); + CV_CheckLE(end, dims, ""); + int elems = 1; - CV_Assert(start <= (int)shape.size() && end <= (int)shape.size() && - start <= end); - for(int i = start; i < end; i++) + for (int i = start; i < end; i++) { elems *= shape[i]; } return elems; } +// TODO: rename to countDimsElements() +static inline int total(const Mat& mat, int start = -1, int end = -1) +{ + if (mat.empty()) + return 0; + + int dims = mat.dims; + + if (start == -1) start = 0; + if (end == -1) end = dims; + + CV_CheckLE(0, start, ""); + CV_CheckLE(start, end, ""); + CV_CheckLE(end, dims, ""); + + int elems = 1; + for (int i = start; i < end; i++) + { + elems *= mat.size[i]; + } + return elems; +} + static inline MatShape concat(const MatShape& a, const MatShape& b) { MatShape c = a; diff --git a/3rdParty/opencv2/dnn/version.hpp b/3rdParty/opencv2/dnn/version.hpp index 3bd04dd27e..a651e05826 100644 --- a/3rdParty/opencv2/dnn/version.hpp +++ b/3rdParty/opencv2/dnn/version.hpp @@ -6,7 +6,7 @@ #define OPENCV_DNN_VERSION_HPP /// Use with major OpenCV version only. -#define OPENCV_DNN_API_VERSION 20220524 +#define OPENCV_DNN_API_VERSION 20241223 #if !defined CV_DOXYGEN && !defined CV_STATIC_ANALYSIS && !defined CV_DNN_DONT_ADD_INLINE_NS #define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION) diff --git a/3rdParty/opencv2/features2d.hpp b/3rdParty/opencv2/features2d.hpp index 4b21d8ea5c..43dcd02ee3 100644 --- a/3rdParty/opencv2/features2d.hpp +++ b/3rdParty/opencv2/features2d.hpp @@ -56,15 +56,15 @@ @defgroup features2d_main Feature Detection and Description @defgroup features2d_match Descriptor Matchers -Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables you to -easily switch between different algorithms solving the same problem. This section is devoted to -matching descriptors that are represented as vectors in a multidimensional space. All objects that -implement vector descriptor matchers inherit the DescriptorMatcher interface. + Matchers of keypoint descriptors in OpenCV have wrappers with a common interface that enables + you to easily switch between different algorithms solving the same problem. This section is + devoted to matching descriptors that are represented as vectors in a multidimensional space. + All objects that implement vector descriptor matchers inherit the DescriptorMatcher interface. 
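[Editor's note, not part of the patch] The reflowed paragraph above describes the common DescriptorMatcher interface. As a usage sketch with placeholder image paths, showing that any matcher implementation can be swapped in behind the same calls:

    #include <opencv2/features2d.hpp>
    #include <opencv2/imgcodecs.hpp>
    #include <vector>

    void matchPair()
    {
        cv::Mat img1 = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);   // placeholder paths
        cv::Mat img2 = cv::imread("right.png", cv::IMREAD_GRAYSCALE);

        cv::Ptr<cv::Feature2D> orb = cv::ORB::create();
        std::vector<cv::KeyPoint> kps1, kps2;
        cv::Mat desc1, desc2;
        orb->detectAndCompute(img1, cv::noArray(), kps1, desc1);
        orb->detectAndCompute(img2, cv::noArray(), kps2, desc2);

        // The concrete matcher is selected by name/enum; the interface is shared.
        cv::Ptr<cv::DescriptorMatcher> matcher =
            cv::DescriptorMatcher::create(cv::DescriptorMatcher::BRUTEFORCE_HAMMING);
        std::vector<cv::DMatch> matches;
        matcher->match(desc1, desc2, matches);
    }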
@defgroup features2d_draw Drawing Function of Keypoints and Matches @defgroup features2d_category Object Categorization -This section describes approaches based on local 2D features and used to categorize objects. + This section describes approaches based on local 2D features and used to categorize objects. @defgroup feature2d_hal Hardware Acceleration Layer @{ @@ -107,6 +107,10 @@ class CV_EXPORTS KeyPointsFilter * Remove keypoints from some image by mask for pixels of this image. */ static void runByPixelsMask( std::vector& keypoints, const Mat& mask ); + /* + * Remove objects from some image and a vector of points by mask for pixels of this image + */ + static void runByPixelsMask2VectorPoint(std::vector &keypoints, std::vector > &removeFrom, const Mat &mask); /* * Remove duplicated keypoints. */ @@ -212,7 +216,10 @@ class CV_EXPORTS_W Feature2D : public virtual Algorithm CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; // see corresponding cv::Algorithm method - CV_WRAP inline void write(const Ptr& fs, const String& name = String()) const { Algorithm::write(fs, name); } + CV_WRAP inline void write(FileStorage& fs, const String& name) const { Algorithm::write(fs, name); } +#if CV_VERSION_MAJOR < 5 + inline void write(const Ptr& fs, const String& name) const { CV_Assert(fs); Algorithm::write(*fs, name); } +#endif }; /** Feature detectors in OpenCV have wrappers with a common interface that enables you to easily switch @@ -279,10 +286,14 @@ class CV_EXPORTS_W SIFT : public Feature2D @param sigma The sigma of the Gaussian applied to the input image at the octave \#0. If your image is captured with a weak camera with soft lenses, you might want to reduce the number. + + @param enable_precise_upscale Whether to enable precise upscaling in the scale pyramid, which maps + index \f$\texttt{x}\f$ to \f$\texttt{2x}\f$. This prevents localization bias. The option + is disabled by default. */ CV_WRAP static Ptr create(int nfeatures = 0, int nOctaveLayers = 3, double contrastThreshold = 0.04, double edgeThreshold = 10, - double sigma = 1.6); + double sigma = 1.6, bool enable_precise_upscale = false); /** @brief Create SIFT with specified descriptorType. @param nfeatures The number of best features to retain. The features are ranked by their scores @@ -306,12 +317,31 @@ class CV_EXPORTS_W SIFT : public Feature2D is captured with a weak camera with soft lenses, you might want to reduce the number. @param descriptorType The type of descriptors. Only CV_32F and CV_8U are supported. + + @param enable_precise_upscale Whether to enable precise upscaling in the scale pyramid, which maps + index \f$\texttt{x}\f$ to \f$\texttt{2x}\f$. This prevents localization bias. The option + is disabled by default. 
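[Editor's note, not part of the patch] The enable_precise_upscale parameter documented above exists only in headers carrying this patch (OpenCV 4.8 and later). A sketch of opting in while keeping the other arguments at their documented defaults:

    #include <opencv2/features2d.hpp>

    // Opt in to the precise 2x upscaling of the base octave; all other
    // parameters are the documented defaults.
    cv::Ptr<cv::SIFT> sift = cv::SIFT::create(/*nfeatures=*/0,
                                              /*nOctaveLayers=*/3,
                                              /*contrastThreshold=*/0.04,
                                              /*edgeThreshold=*/10,
                                              /*sigma=*/1.6,
                                              /*enable_precise_upscale=*/true);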
*/ CV_WRAP static Ptr create(int nfeatures, int nOctaveLayers, double contrastThreshold, double edgeThreshold, - double sigma, int descriptorType); + double sigma, int descriptorType, bool enable_precise_upscale = false); CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; + + CV_WRAP virtual void setNFeatures(int maxFeatures) = 0; + CV_WRAP virtual int getNFeatures() const = 0; + + CV_WRAP virtual void setNOctaveLayers(int nOctaveLayers) = 0; + CV_WRAP virtual int getNOctaveLayers() const = 0; + + CV_WRAP virtual void setContrastThreshold(double contrastThreshold) = 0; + CV_WRAP virtual double getContrastThreshold() const = 0; + + CV_WRAP virtual void setEdgeThreshold(double edgeThreshold) = 0; + CV_WRAP virtual double getEdgeThreshold() const = 0; + + CV_WRAP virtual void setSigma(double sigma) = 0; + CV_WRAP virtual double getSigma() const = 0; }; typedef SIFT SiftFeatureDetector; @@ -367,14 +397,20 @@ class CV_EXPORTS_W BRISK : public Feature2D /** @brief Set detection threshold. @param threshold AGAST detection threshold score. */ - CV_WRAP virtual void setThreshold(int threshold) { CV_UNUSED(threshold); return; } - CV_WRAP virtual int getThreshold() const { return -1; } + CV_WRAP virtual void setThreshold(int threshold) = 0; + CV_WRAP virtual int getThreshold() const = 0; /** @brief Set detection octaves. @param octaves detection octaves. Use 0 to do single scale. */ - CV_WRAP virtual void setOctaves(int octaves) { CV_UNUSED(octaves); return; } - CV_WRAP virtual int getOctaves() const { return -1; } + CV_WRAP virtual void setOctaves(int octaves) = 0; + CV_WRAP virtual int getOctaves() const = 0; + /** @brief Set detection patternScale. + @param patternScale apply this scale to the pattern used for sampling the neighbourhood of a + keypoint. + */ + CV_WRAP virtual void setPatternScale(float patternScale) = 0; + CV_WRAP virtual float getPatternScale() const = 0; }; /** @brief Class implementing the ORB (*oriented BRIEF*) keypoint detector and descriptor extractor @@ -507,15 +543,30 @@ class CV_EXPORTS_W MSER : public Feature2D CV_WRAP virtual void setMaxArea(int maxArea) = 0; CV_WRAP virtual int getMaxArea() const = 0; + CV_WRAP virtual void setMaxVariation(double maxVariation) = 0; + CV_WRAP virtual double getMaxVariation() const = 0; + + CV_WRAP virtual void setMinDiversity(double minDiversity) = 0; + CV_WRAP virtual double getMinDiversity() const = 0; + + CV_WRAP virtual void setMaxEvolution(int maxEvolution) = 0; + CV_WRAP virtual int getMaxEvolution() const = 0; + + CV_WRAP virtual void setAreaThreshold(double areaThreshold) = 0; + CV_WRAP virtual double getAreaThreshold() const = 0; + + CV_WRAP virtual void setMinMargin(double min_margin) = 0; + CV_WRAP virtual double getMinMargin() const = 0; + + CV_WRAP virtual void setEdgeBlurSize(int edge_blur_size) = 0; + CV_WRAP virtual int getEdgeBlurSize() const = 0; + CV_WRAP virtual void setPass2Only(bool f) = 0; CV_WRAP virtual bool getPass2Only() const = 0; + CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; }; -//! @} features2d_main - -//! @addtogroup features2d_main -//! @{ /** @brief Wrapping class for feature detection using the FAST method. : */ @@ -572,10 +623,6 @@ detection, use cv.FAST.detect() method. CV_EXPORTS void FAST( InputArray image, CV_OUT std::vector& keypoints, int threshold, bool nonmaxSuppression, FastFeatureDetector::DetectorType type ); -//! @} features2d_main - -//! @addtogroup features2d_main -//! @{ /** @brief Wrapping class for feature detection using the AGAST method. 
: */ @@ -653,6 +700,9 @@ class CV_EXPORTS_W GFTTDetector : public Feature2D CV_WRAP virtual void setBlockSize(int blockSize) = 0; CV_WRAP virtual int getBlockSize() const = 0; + CV_WRAP virtual void setGradientSize(int gradientSize_) = 0; + CV_WRAP virtual int getGradientSize() = 0; + CV_WRAP virtual void setHarrisDetector(bool val) = 0; CV_WRAP virtual bool getHarrisDetector() const = 0; @@ -719,19 +769,22 @@ class CV_EXPORTS_W SimpleBlobDetector : public Feature2D CV_PROP_RW bool filterByConvexity; CV_PROP_RW float minConvexity, maxConvexity; + CV_PROP_RW bool collectContours; + void read( const FileNode& fn ); void write( FileStorage& fs ) const; }; CV_WRAP static Ptr create(const SimpleBlobDetector::Params ¶meters = SimpleBlobDetector::Params()); + + CV_WRAP virtual void setParams(const SimpleBlobDetector::Params& params ) = 0; + CV_WRAP virtual SimpleBlobDetector::Params getParams() const = 0; + CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; + CV_WRAP virtual const std::vector >& getBlobContours() const; }; -//! @} features2d_main - -//! @addtogroup features2d_main -//! @{ /** @brief Class implementing the KAZE keypoint detector and descriptor extractor, described in @cite ABD12 . @@ -824,11 +877,15 @@ class CV_EXPORTS_W AKAZE : public Feature2D @param nOctaveLayers Default number of sublevels per scale level @param diffusivity Diffusivity type. DIFF_PM_G1, DIFF_PM_G2, DIFF_WEICKERT or DIFF_CHARBONNIER + @param max_points Maximum amount of returned points. In case if image contains + more features, then the features with highest response are returned. + Negative value means no limitation. */ CV_WRAP static Ptr create(AKAZE::DescriptorType descriptor_type = AKAZE::DESCRIPTOR_MLDB, int descriptor_size = 0, int descriptor_channels = 3, float threshold = 0.001f, int nOctaves = 4, - int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2); + int nOctaveLayers = 4, KAZE::DiffusivityType diffusivity = KAZE::DIFF_PM_G2, + int max_points = -1); CV_WRAP virtual void setDescriptorType(AKAZE::DescriptorType dtype) = 0; CV_WRAP virtual AKAZE::DescriptorType getDescriptorType() const = 0; @@ -851,9 +908,11 @@ class CV_EXPORTS_W AKAZE : public Feature2D CV_WRAP virtual void setDiffusivity(KAZE::DiffusivityType diff) = 0; CV_WRAP virtual KAZE::DiffusivityType getDiffusivity() const = 0; CV_WRAP virtual String getDefaultName() const CV_OVERRIDE; + + CV_WRAP virtual void setMaxPoints(int max_points) = 0; + CV_WRAP virtual int getMaxPoints() const = 0; }; -//! @} features2d_main /****************************************************************************************\ * Distance * @@ -918,6 +977,8 @@ struct L1 } }; +//! 
@} features2d_main + /****************************************************************************************\ * DescriptorMatcher * \****************************************************************************************/ @@ -1124,7 +1185,10 @@ class CV_EXPORTS_W DescriptorMatcher : public Algorithm // see corresponding cv::Algorithm method - CV_WRAP inline void write(const Ptr& fs, const String& name = String()) const { Algorithm::write(fs, name); } + CV_WRAP inline void write(FileStorage& fs, const String& name) const { Algorithm::write(fs, name); } +#if CV_VERSION_MAJOR < 5 + inline void write(const Ptr& fs, const String& name) const { CV_Assert(fs); Algorithm::write(*fs, name); } +#endif protected: /** @@ -1356,6 +1420,9 @@ CV_EXPORTS_AS(drawMatchesKnn) void drawMatches( InputArray img1, const std::vect * Functions to evaluate the feature detectors and [generic] descriptor extractors * \****************************************************************************************/ +//! @addtogroup features2d_main +//! @{ + CV_EXPORTS void evaluateFeatureDetector( const Mat& img1, const Mat& img2, const Mat& H1to2, std::vector* keypoints1, std::vector* keypoints2, float& repeatability, int& correspCount, @@ -1368,6 +1435,8 @@ CV_EXPORTS void computeRecallPrecisionCurve( const std::vector& recallPrecisionCurve, float l_precision ); CV_EXPORTS int getNearestPoint( const std::vector& recallPrecisionCurve, float l_precision ); +//! @} + /****************************************************************************************\ * Bag of visual words * \****************************************************************************************/ @@ -1469,8 +1538,8 @@ class CV_EXPORTS_W BOWImgDescriptorExtractor @param dmatcher Descriptor matcher that is used to find the nearest word of the trained vocabulary for each keypoint descriptor of the image. */ - CV_WRAP BOWImgDescriptorExtractor( const Ptr& dextractor, - const Ptr& dmatcher ); + CV_WRAP BOWImgDescriptorExtractor( const Ptr& dextractor, + const Ptr& dmatcher ); /** @overload */ BOWImgDescriptorExtractor( const Ptr& dmatcher ); virtual ~BOWImgDescriptorExtractor(); @@ -1528,8 +1597,6 @@ class CV_EXPORTS_W BOWImgDescriptorExtractor //! @} features2d_category -//! 
@} features2d - } /* namespace cv */ #endif diff --git a/3rdParty/opencv2/flann/any.h b/3rdParty/opencv2/flann/any.h index 4166553a3d..bc7df1039c 100644 --- a/3rdParty/opencv2/flann/any.h +++ b/3rdParty/opencv2/flann/any.h @@ -19,16 +19,39 @@ #include #include +#include "opencv2/core/cvdef.h" +#include "opencv2/core/utility.hpp" + namespace cvflann { namespace anyimpl { -struct bad_any_cast +struct bad_any_cast : public std::exception { + bad_any_cast() = default; + + bad_any_cast(const char* src, const char* dst) + : message_(cv::format("cvflann::bad_any_cast(from %s to %s)", src, dst)) {} + + + const char* what() const noexcept override + { + return message_.c_str(); + } + +private: + std::string message_{"cvflann::bad_any_cast"}; }; +#ifndef CV_THROW_IF_TYPE_MISMATCH +#define CV_THROW_IF_TYPE_MISMATCH(src_type_info, dst_type_info) \ + if ((src_type_info) != (dst_type_info)) \ + throw cvflann::anyimpl::bad_any_cast((src_type_info).name(), \ + (dst_type_info).name()) +#endif + struct empty_any { }; @@ -271,7 +294,7 @@ struct any template T& cast() { - if (policy->type() != typeid(T)) throw anyimpl::bad_any_cast(); + CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T)); T* r = reinterpret_cast(policy->get_value(&object)); return *r; } @@ -280,7 +303,7 @@ struct any template const T& cast() const { - if (policy->type() != typeid(T)) throw anyimpl::bad_any_cast(); + CV_THROW_IF_TYPE_MISMATCH(policy->type(), typeid(T)); const T* r = reinterpret_cast(policy->get_value(&object)); return *r; } diff --git a/3rdParty/opencv2/flann/composite_index.h b/3rdParty/opencv2/flann/composite_index.h index d69eeecca2..1edaf557a3 100644 --- a/3rdParty/opencv2/flann/composite_index.h +++ b/3rdParty/opencv2/flann/composite_index.h @@ -80,7 +80,6 @@ class CompositeIndex : public NNIndex * @param inputData dataset containing the points to index * @param params Index parameters * @param d Distance functor - * @return */ CompositeIndex(const Matrix& inputData, const IndexParams& params = CompositeIndexParams(), Distance d = Distance()) : index_params_(params) diff --git a/3rdParty/opencv2/flann/dist.h b/3rdParty/opencv2/flann/dist.h index 3d97f2a7bc..d78c3ff65c 100644 --- a/3rdParty/opencv2/flann/dist.h +++ b/3rdParty/opencv2/flann/dist.h @@ -1,4 +1,4 @@ -/*********************************************************************** +/*********************************************************************** * Software License Agreement (BSD License) * * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. 
@@ -49,7 +49,7 @@ typedef unsigned __int64 uint64_t; # include #endif -#if defined(__ARM_NEON__) && !defined(__CUDACC__) +#if defined(__ARM_NEON) && !defined(__CUDACC__) # include "arm_neon.h" #endif @@ -559,7 +559,7 @@ struct Hamming ResultType operator()(const Iterator1 a, const Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const { ResultType result = 0; -#if defined(__ARM_NEON__) && !defined(__CUDACC__) +#if defined(__ARM_NEON) && !defined(__CUDACC__) { const unsigned char* a2 = reinterpret_cast (a); const unsigned char* b2 = reinterpret_cast (b); @@ -611,7 +611,7 @@ struct Hamming { (void)b; ResultType result = 0; -#if defined(__ARM_NEON__) && !defined(__CUDACC__) +#if defined(__ARM_NEON) && !defined(__CUDACC__) { const unsigned char* a2 = reinterpret_cast (a); uint32x4_t bits = vmovq_n_u32(0); diff --git a/3rdParty/opencv2/flann/dynamic_bitset.h b/3rdParty/opencv2/flann/dynamic_bitset.h index 849f95a4d8..a3f0be6fa8 100644 --- a/3rdParty/opencv2/flann/dynamic_bitset.h +++ b/3rdParty/opencv2/flann/dynamic_bitset.h @@ -97,7 +97,6 @@ class DynamicBitset } /** @brief set one bit to 0 - * @param index */ void reset(size_t index) { @@ -108,7 +107,6 @@ class DynamicBitset * This function is useful when resetting a given set of bits so that the * whole bitset ends up being 0: if that's the case, we don't care about setting * other bits to 0 - * @param index */ void reset_block(size_t index) { @@ -116,7 +114,6 @@ class DynamicBitset } /** resize the bitset so that it contains at least sz bits - * @param sz */ void resize(size_t sz) { diff --git a/3rdParty/opencv2/flann/flann_base.hpp b/3rdParty/opencv2/flann/flann_base.hpp index 2cd1f2b0e1..a516dfa26e 100644 --- a/3rdParty/opencv2/flann/flann_base.hpp +++ b/3rdParty/opencv2/flann/flann_base.hpp @@ -45,6 +45,21 @@ namespace cvflann { +class FILEScopeGuard { + +public: + explicit FILEScopeGuard(FILE* file) { + file_ = file; + }; + + ~FILEScopeGuard() { + fclose(file_); + }; + +private: + FILE* file_; +}; + /** * Sets the log level used for all flann functions @@ -69,7 +84,6 @@ struct SavedIndexParams : public IndexParams } }; - template NNIndex* load_saved_index(const Matrix& dataset, const cv::String& filename, Distance distance) { @@ -79,13 +93,13 @@ NNIndex* load_saved_index(const Matrix if (fin == NULL) { return NULL; } + FILEScopeGuard fscgd(fin); + IndexHeader header = load_header(fin); if (header.data_type != Datatype::type()) { - fclose(fin); FLANN_THROW(cv::Error::StsError, "Datatype of saved index is different than of the one to be created."); } if ((size_t(header.rows) != dataset.rows)||(size_t(header.cols) != dataset.cols)) { - fclose(fin); FLANN_THROW(cv::Error::StsError, "The index saved belongs to a different dataset"); } @@ -93,7 +107,6 @@ NNIndex* load_saved_index(const Matrix params["algorithm"] = header.index_type; NNIndex* nnIndex = create_index_by_type(dataset, params, distance); nnIndex->loadIndex(fin); - fclose(fin); return nnIndex; } @@ -107,7 +120,7 @@ class Index : public NNIndex typedef typename Distance::ResultType DistanceType; Index(const Matrix& features, const IndexParams& params, Distance distance = Distance() ) - : index_params_(params) + :index_params_(params) { flann_algorithm_t index_type = get_param(params,"algorithm"); loaded_ = false; diff --git a/3rdParty/opencv2/flann/general.h b/3rdParty/opencv2/flann/general.h index 2dcaf4d073..eeb07e41f9 100644 --- a/3rdParty/opencv2/flann/general.h +++ b/3rdParty/opencv2/flann/general.h @@ -31,6 +31,8 @@ #ifndef OPENCV_FLANN_GENERAL_H_ #define 
OPENCV_FLANN_GENERAL_H_ +#include "opencv2/core/version.hpp" + #if CV_VERSION_MAJOR <= 4 //! @cond IGNORED diff --git a/3rdParty/opencv2/flann/hdf5.h b/3rdParty/opencv2/flann/hdf5.h deleted file mode 100644 index 08ad18e6f7..0000000000 --- a/3rdParty/opencv2/flann/hdf5.h +++ /dev/null @@ -1,235 +0,0 @@ -/*********************************************************************** - * Software License Agreement (BSD License) - * - * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. - * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *************************************************************************/ - - -#ifndef OPENCV_FLANN_HDF5_H_ -#define OPENCV_FLANN_HDF5_H_ - -//! 
@cond IGNORED - -#include - -#include "matrix.h" - - -namespace cvflann -{ - -namespace -{ - -template -hid_t get_hdf5_type() -{ - throw FLANNException("Unsupported type for IO operations"); -} - -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_CHAR; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_UCHAR; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_SHORT; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_USHORT; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_INT; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_UINT; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_LONG; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_ULONG; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_FLOAT; } -template<> -hid_t get_hdf5_type() { return H5T_NATIVE_DOUBLE; } -} - - -#define CHECK_ERROR(x,y) if ((x)<0) throw FLANNException((y)); - -template -void save_to_file(const cvflann::Matrix& dataset, const String& filename, const String& name) -{ - -#if H5Eset_auto_vers == 2 - H5Eset_auto( H5E_DEFAULT, NULL, NULL ); -#else - H5Eset_auto( NULL, NULL ); -#endif - - herr_t status; - hid_t file_id; - file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT); - if (file_id < 0) { - file_id = H5Fcreate(filename.c_str(), H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT); - } - CHECK_ERROR(file_id,"Error creating hdf5 file."); - - hsize_t dimsf[2]; // dataset dimensions - dimsf[0] = dataset.rows; - dimsf[1] = dataset.cols; - - hid_t space_id = H5Screate_simple(2, dimsf, NULL); - hid_t memspace_id = H5Screate_simple(2, dimsf, NULL); - - hid_t dataset_id; -#if H5Dcreate_vers == 2 - dataset_id = H5Dcreate2(file_id, name.c_str(), get_hdf5_type(), space_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); -#else - dataset_id = H5Dcreate(file_id, name.c_str(), get_hdf5_type(), space_id, H5P_DEFAULT); -#endif - - if (dataset_id<0) { -#if H5Dopen_vers == 2 - dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT); -#else - dataset_id = H5Dopen(file_id, name.c_str()); -#endif - } - CHECK_ERROR(dataset_id,"Error creating or opening dataset in file."); - - status = H5Dwrite(dataset_id, get_hdf5_type(), memspace_id, space_id, H5P_DEFAULT, dataset.data ); - CHECK_ERROR(status, "Error writing to dataset"); - - H5Sclose(memspace_id); - H5Sclose(space_id); - H5Dclose(dataset_id); - H5Fclose(file_id); - -} - - -template -void load_from_file(cvflann::Matrix& dataset, const String& filename, const String& name) -{ - herr_t status; - hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, H5P_DEFAULT); - CHECK_ERROR(file_id,"Error opening hdf5 file."); - - hid_t dataset_id; -#if H5Dopen_vers == 2 - dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT); -#else - dataset_id = H5Dopen(file_id, name.c_str()); -#endif - CHECK_ERROR(dataset_id,"Error opening dataset in file."); - - hid_t space_id = H5Dget_space(dataset_id); - - hsize_t dims_out[2]; - H5Sget_simple_extent_dims(space_id, dims_out, NULL); - - dataset = cvflann::Matrix(new T[dims_out[0]*dims_out[1]], dims_out[0], dims_out[1]); - - status = H5Dread(dataset_id, get_hdf5_type(), H5S_ALL, H5S_ALL, H5P_DEFAULT, dataset[0]); - CHECK_ERROR(status, "Error reading dataset"); - - H5Sclose(space_id); - H5Dclose(dataset_id); - H5Fclose(file_id); -} - - -#ifdef HAVE_MPI - -namespace mpi -{ -/** - * Loads a the hyperslice corresponding to this processor from a hdf5 file. 
- * @param flann_dataset Dataset where the data is loaded - * @param filename HDF5 file name - * @param name Name of dataset inside file - */ -template -void load_from_file(cvflann::Matrix& dataset, const String& filename, const String& name) -{ - MPI_Comm comm = MPI_COMM_WORLD; - MPI_Info info = MPI_INFO_NULL; - - int mpi_size, mpi_rank; - MPI_Comm_size(comm, &mpi_size); - MPI_Comm_rank(comm, &mpi_rank); - - herr_t status; - - hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, comm, info); - hid_t file_id = H5Fopen(filename.c_str(), H5F_ACC_RDWR, plist_id); - CHECK_ERROR(file_id,"Error opening hdf5 file."); - H5Pclose(plist_id); - hid_t dataset_id; -#if H5Dopen_vers == 2 - dataset_id = H5Dopen2(file_id, name.c_str(), H5P_DEFAULT); -#else - dataset_id = H5Dopen(file_id, name.c_str()); -#endif - CHECK_ERROR(dataset_id,"Error opening dataset in file."); - - hid_t space_id = H5Dget_space(dataset_id); - hsize_t dims[2]; - H5Sget_simple_extent_dims(space_id, dims, NULL); - - hsize_t count[2]; - hsize_t offset[2]; - - hsize_t item_cnt = dims[0]/mpi_size+(dims[0]%mpi_size==0 ? 0 : 1); - hsize_t cnt = (mpi_rank(), memspace_id, space_id, plist_id, dataset.data); - CHECK_ERROR(status, "Error reading dataset"); - - H5Pclose(plist_id); - H5Sclose(space_id); - H5Sclose(memspace_id); - H5Dclose(dataset_id); - H5Fclose(file_id); -} -} -#endif // HAVE_MPI -} // namespace cvflann::mpi - -//! @endcond - -#endif /* OPENCV_FLANN_HDF5_H_ */ diff --git a/3rdParty/opencv2/flann/logger.h b/3rdParty/opencv2/flann/logger.h index afb62b3f6a..7708dfa2b8 100644 --- a/3rdParty/opencv2/flann/logger.h +++ b/3rdParty/opencv2/flann/logger.h @@ -101,7 +101,6 @@ class Logger * Print log message * @param level Log level * @param fmt Message format - * @return */ static int log(int level, const char* fmt, ...) { diff --git a/3rdParty/opencv2/flann/lsh_table.h b/3rdParty/opencv2/flann/lsh_table.h index 989c5f98a2..9c72d11d52 100644 --- a/3rdParty/opencv2/flann/lsh_table.h +++ b/3rdParty/opencv2/flann/lsh_table.h @@ -214,8 +214,6 @@ class LshTable } /** Get a bucket given the key - * @param key - * @return */ inline const Bucket* getBucketFromKey(BucketKey key) const { @@ -253,7 +251,6 @@ class LshTable } /** Get statistics about the table - * @return */ LshStats getStats() const; @@ -427,7 +424,7 @@ inline size_t LshTable::getKey(const unsigned char* feature) cons size_t mask_block = mask_[i / sizeof(size_t)]; while (mask_block) { // Get the lowest set bit in the mask block - size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block); + size_t lowest_bit = mask_block & ~(mask_block - 1); // Add it to the current subsignature if necessary subsignature += (feature_block & lowest_bit) ? 
bit_index : 0; // Reset the bit in the mask block diff --git a/3rdParty/opencv2/flann/matrix.h b/3rdParty/opencv2/flann/matrix.h index 82799cedf1..5dc326d63e 100644 --- a/3rdParty/opencv2/flann/matrix.h +++ b/3rdParty/opencv2/flann/matrix.h @@ -35,6 +35,9 @@ #include +#include "opencv2/core/cvdef.h" +#include "opencv2/flann/defines.h" + namespace cvflann { diff --git a/3rdParty/opencv2/flann/params.h b/3rdParty/opencv2/flann/params.h index f868ddfd87..04aac83a42 100644 --- a/3rdParty/opencv2/flann/params.h +++ b/3rdParty/opencv2/flann/params.h @@ -72,11 +72,16 @@ struct SearchParams : public IndexParams template -T get_param(const IndexParams& params, cv::String name, const T& default_value) +T get_param(const IndexParams& params, const cv::String& name, const T& default_value) { IndexParams::const_iterator it = params.find(name); if (it != params.end()) { - return it->second.cast(); + try { + return it->second.cast(); + } catch (const std::exception& e) { + CV_Error_(cv::Error::StsBadArg, + ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what())); + } } else { return default_value; @@ -84,11 +89,16 @@ T get_param(const IndexParams& params, cv::String name, const T& default_value) } template -T get_param(const IndexParams& params, cv::String name) +T get_param(const IndexParams& params, const cv::String& name) { IndexParams::const_iterator it = params.find(name); if (it != params.end()) { - return it->second.cast(); + try { + return it->second.cast(); + } catch (const std::exception& e) { + CV_Error_(cv::Error::StsBadArg, + ("FLANN '%s' param type mismatch: %s", name.c_str(), e.what())); + } } else { FLANN_THROW(cv::Error::StsBadArg, cv::String("Missing parameter '")+name+cv::String("' in the parameters given")); diff --git a/3rdParty/opencv2/flann/random.h b/3rdParty/opencv2/flann/random.h index 11ceb7646d..4ad50c43a0 100644 --- a/3rdParty/opencv2/flann/random.h +++ b/3rdParty/opencv2/flann/random.h @@ -106,7 +106,6 @@ class UniqueRandom /** * Constructor. 
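[Editor's note, not part of the patch] The hardened get_param above now turns a cvflann type-mismatch into a descriptive cv::Exception (StsBadArg) instead of an opaque bad_any_cast. From user code the IndexParams round trip is unchanged; a sketch through the public cv::flann wrapper, with arbitrary random data:

    #include <opencv2/core.hpp>
    #include <opencv2/flann.hpp>

    void buildAndQuery()
    {
        cv::Mat features(100, 8, CV_32F);
        cv::randu(features, cv::Scalar(0), cv::Scalar(1));

        // KD-tree index; the "trees" value is stored in IndexParams and read
        // back internally through get_param<int>().
        cv::flann::Index index(features, cv::flann::KDTreeIndexParams(4));

        cv::Mat query = features.row(0).clone();
        cv::Mat indices, dists;
        index.knnSearch(query, indices, dists, /*knn=*/3, cv::flann::SearchParams(32));
    }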
* @param n Size of the interval from which to generate - * @return */ UniqueRandom(int n) { diff --git a/3rdParty/opencv2/flann/result_set.h b/3rdParty/opencv2/flann/result_set.h index 0ff569923a..08f711c694 100644 --- a/3rdParty/opencv2/flann/result_set.h +++ b/3rdParty/opencv2/flann/result_set.h @@ -40,6 +40,9 @@ #include #include +#include "opencv2/core/base.hpp" +#include "opencv2/core/cvdef.h" + namespace cvflann { @@ -357,7 +360,6 @@ class UniqueResultSet : public ResultSet } /** The number of neighbors in the set - * @return */ size_t size() const { @@ -366,7 +368,6 @@ class UniqueResultSet : public ResultSet /** The distance of the furthest neighbor * If we don't have enough neighbors, it returns the max possible value - * @return */ inline DistanceType worstDist() const CV_OVERRIDE { @@ -487,7 +488,6 @@ class RadiusUniqueResultSet : public UniqueResultSet /** The distance of the furthest neighbor * If we don't have enough neighbors, it returns the max possible value - * @return */ inline DistanceType worstDist() const CV_OVERRIDE { diff --git a/3rdParty/opencv2/gapi.hpp b/3rdParty/opencv2/gapi.hpp index 277f234e92..fa36a92a38 100644 --- a/3rdParty/opencv2/gapi.hpp +++ b/3rdParty/opencv2/gapi.hpp @@ -10,7 +10,7 @@ #include -/** \defgroup gapi G-API framework +/** \defgroup gapi_ref G-API framework @{ @defgroup gapi_main_classes G-API Main Classes @defgroup gapi_data_objects G-API Data Types diff --git a/3rdParty/opencv2/gapi/core.hpp b/3rdParty/opencv2/gapi/core.hpp index 44956b990c..370e4ccf86 100644 --- a/3rdParty/opencv2/gapi/core.hpp +++ b/3rdParty/opencv2/gapi/core.hpp @@ -635,7 +635,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref */ GAPI_EXPORTS_W GMat addC(const GMat& src1, const GScalar& c, int ddepth = -1); //! @overload -GAPI_EXPORTS GMat addC(const GScalar& c, const GMat& src1, int ddepth = -1); +GAPI_EXPORTS_W GMat addC(const GScalar& c, const GMat& src1, int ddepth = -1); /** @brief Calculates the per-element difference between two matrices. @@ -660,7 +660,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. @sa add, addC */ -GAPI_EXPORTS GMat sub(const GMat& src1, const GMat& src2, int ddepth = -1); +GAPI_EXPORTS_W GMat sub(const GMat& src1, const GMat& src2, int ddepth = -1); /** @brief Calculates the per-element difference between matrix and given scalar. @@ -679,7 +679,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. @sa add, addC, subRC */ -GAPI_EXPORTS GMat subC(const GMat& src, const GScalar& c, int ddepth = -1); +GAPI_EXPORTS_W GMat subC(const GMat& src, const GScalar& c, int ddepth = -1); /** @brief Calculates the per-element difference between given scalar and the matrix. @@ -698,7 +698,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. @sa add, addC, subC */ -GAPI_EXPORTS GMat subRC(const GScalar& c, const GMat& src, int ddepth = -1); +GAPI_EXPORTS_W GMat subRC(const GScalar& c, const GMat& src, int ddepth = -1); /** @brief Calculates the per-element scaled product of two matrices. @@ -719,7 +719,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. 
@sa add, sub, div, addWeighted */ -GAPI_EXPORTS GMat mul(const GMat& src1, const GMat& src2, double scale = 1.0, int ddepth = -1); +GAPI_EXPORTS_W GMat mul(const GMat& src1, const GMat& src2, double scale = 1.0, int ddepth = -1); /** @brief Multiplies matrix by scalar. @@ -737,11 +737,11 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth. @sa add, sub, div, addWeighted */ -GAPI_EXPORTS GMat mulC(const GMat& src, double multiplier, int ddepth = -1); +GAPI_EXPORTS_W GMat mulC(const GMat& src, double multiplier, int ddepth = -1); //! @overload -GAPI_EXPORTS GMat mulC(const GMat& src, const GScalar& multiplier, int ddepth = -1); // FIXME: merge with mulc +GAPI_EXPORTS_W GMat mulC(const GMat& src, const GScalar& multiplier, int ddepth = -1); // FIXME: merge with mulc //! @overload -GAPI_EXPORTS GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth = -1); // FIXME: merge with mulc +GAPI_EXPORTS_W GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth = -1); // FIXME: merge with mulc /** @brief Performs per-element division of two matrices. @@ -764,7 +764,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix; you can only pass -1 when src1.depth() == src2.depth(). @sa mul, add, sub */ -GAPI_EXPORTS GMat div(const GMat& src1, const GMat& src2, double scale, int ddepth = -1); +GAPI_EXPORTS_W GMat div(const GMat& src1, const GMat& src2, double scale, int ddepth = -1); /** @brief Divides matrix by scalar. @@ -785,7 +785,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param scale scale factor. @sa add, sub, div, addWeighted */ -GAPI_EXPORTS GMat divC(const GMat& src, const GScalar& divisor, double scale, int ddepth = -1); +GAPI_EXPORTS_W GMat divC(const GMat& src, const GScalar& divisor, double scale, int ddepth = -1); /** @brief Divides scalar by matrix. @@ -806,7 +806,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param scale scale factor @sa add, sub, div, addWeighted */ -GAPI_EXPORTS GMat divRC(const GScalar& divident, const GMat& src, double scale, int ddepth = -1); +GAPI_EXPORTS_W GMat divRC(const GScalar& divident, const GMat& src, double scale, int ddepth = -1); /** @brief Applies a mask to a matrix. @@ -819,7 +819,7 @@ Supported src matrix data types are @ref CV_8UC1, @ref CV_16SC1, @ref CV_16UC1. @param src input matrix. @param mask input mask matrix. */ -GAPI_EXPORTS GMat mask(const GMat& src, const GMat& mask); +GAPI_EXPORTS_W GMat mask(const GMat& src, const GMat& mask); /** @brief Calculates an average (mean) of matrix elements. @@ -854,8 +854,8 @@ Both output must have the same size and depth as input matrices. degrees, otherwise, they are measured in radians. @sa cartToPolar, exp, log, pow, sqrt */ -GAPI_EXPORTS std::tuple polarToCart(const GMat& magnitude, const GMat& angle, - bool angleInDegrees = false); +GAPI_EXPORTS_W std::tuple polarToCart(const GMat& magnitude, const GMat& angle, + bool angleInDegrees = false); /** @brief Calculates the magnitude and angle of 2D vectors. @@ -878,8 +878,8 @@ x; the angles are measured in radians (from 0 to 2\*Pi) or in degrees (0 to 360 in radians (which is by default), or in degrees. 
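[Editor's note, not part of the patch] The hunks in this file only widen the export macro to GAPI_EXPORTS_W so these kernels become visible to the bindings generator; the C++ graph-building API is untouched. A minimal sketch composing two of the arithmetic kernels, with arbitrary constants:

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>

    void halfPlusTen(const cv::Mat& src, cv::Mat& dst)
    {
        cv::GMat in;
        cv::GMat scaled = cv::gapi::mulC(in, 0.5);                        // per-element scaling
        cv::GMat out    = cv::gapi::addC(scaled, cv::GScalar(cv::Scalar(10)));
        cv::GComputation graph(cv::GIn(in), cv::GOut(out));
        graph.apply(cv::gin(src), cv::gout(dst));
    }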
@sa polarToCart */ -GAPI_EXPORTS std::tuple cartToPolar(const GMat& x, const GMat& y, - bool angleInDegrees = false); +GAPI_EXPORTS_W std::tuple cartToPolar(const GMat& x, const GMat& y, + bool angleInDegrees = false); /** @brief Calculates the rotation angle of 2D vectors. @@ -896,7 +896,7 @@ same size and the same type as x. degrees, otherwise, they are measured in radians. @return array of vector angles; it has the same size and same type as x. */ -GAPI_EXPORTS GMat phase(const GMat& x, const GMat &y, bool angleInDegrees = false); +GAPI_EXPORTS_W GMat phase(const GMat& x, const GMat &y, bool angleInDegrees = false); /** @brief Calculates a square root of array elements. @@ -907,7 +907,7 @@ std::sqrt . @param src input floating-point array. @return output array of the same size and type as src. */ -GAPI_EXPORTS GMat sqrt(const GMat &src); +GAPI_EXPORTS_W GMat sqrt(const GMat &src); //! @} gapi_math //! @@ -934,11 +934,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_16UC1, @ref CV_16SC1 @param src2 second input matrix/scalar of the same depth as first input matrix. @sa min, max, threshold, cmpLE, cmpGE, cmpLT */ -GAPI_EXPORTS GMat cmpGT(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpGT(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGTScalar" */ -GAPI_EXPORTS GMat cmpGT(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpGT(const GMat& src1, const GScalar& src2); /** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are less than elements in second. @@ -960,11 +960,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix/scalar of the same depth as first input matrix. @sa min, max, threshold, cmpLE, cmpGE, cmpGT */ -GAPI_EXPORTS GMat cmpLT(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpLT(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLTScalar" */ -GAPI_EXPORTS GMat cmpLT(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpLT(const GMat& src1, const GScalar& src2); /** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are greater or equal compare to elements in second. @@ -986,11 +986,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix/scalar of the same depth as first input matrix. @sa min, max, threshold, cmpLE, cmpGT, cmpLT */ -GAPI_EXPORTS GMat cmpGE(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpGE(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLGEcalar" */ -GAPI_EXPORTS GMat cmpGE(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpGE(const GMat& src1, const GScalar& src2); /** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are less or equal compare to elements in second. @@ -1012,11 +1012,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix/scalar of the same depth as first input matrix. 
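[Editor's note, not part of the patch] As with the arithmetic kernels, the comparison kernels below only gain the wrapper macro. For reference, a sketch of combining a comparison with mask; the 128 threshold is arbitrary and a CV_8UC1 input is assumed:

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>

    cv::GComputation makeKeepBright()
    {
        cv::GMat in;                                              // CV_8UC1 input assumed
        cv::GMat bright = cv::gapi::cmpGT(in, cv::GScalar(128));  // 255 where in > 128, else 0
        cv::GMat out    = cv::gapi::mask(in, bright);             // zero out the rest
        return cv::GComputation(cv::GIn(in), cv::GOut(out));
    }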
@sa min, max, threshold, cmpGT, cmpGE, cmpLT */ -GAPI_EXPORTS GMat cmpLE(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpLE(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLEScalar" */ -GAPI_EXPORTS GMat cmpLE(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpLE(const GMat& src1, const GScalar& src2); /** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are equal to elements in second. @@ -1038,11 +1038,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix/scalar of the same depth as first input matrix. @sa min, max, threshold, cmpNE */ -GAPI_EXPORTS GMat cmpEQ(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpEQ(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpEQScalar" */ -GAPI_EXPORTS GMat cmpEQ(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpEQ(const GMat& src1, const GScalar& src2); /** @brief Performs the per-element comparison of two matrices checking if elements from first matrix are not equal to elements in second. @@ -1064,11 +1064,11 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix/scalar of the same depth as first input matrix. @sa min, max, threshold, cmpEQ */ -GAPI_EXPORTS GMat cmpNE(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat cmpNE(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.compare.cmpNEScalar" */ -GAPI_EXPORTS GMat cmpNE(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat cmpNE(const GMat& src1, const GScalar& src2); /** @brief computes bitwise conjunction of the two matrixes (src1 & src2) Calculates the per-element bit-wise logical conjunction of two matrices of the same size. @@ -1086,13 +1086,13 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src1 first input matrix. @param src2 second input matrix. */ -GAPI_EXPORTS GMat bitwise_and(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat bitwise_and(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.bitwise_andS" @param src1 first input matrix. @param src2 scalar, which will be per-lemenetly conjuncted with elements of src1. */ -GAPI_EXPORTS GMat bitwise_and(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat bitwise_and(const GMat& src1, const GScalar& src2); /** @brief computes bitwise disjunction of the two matrixes (src1 | src2) Calculates the per-element bit-wise logical disjunction of two matrices of the same size. @@ -1110,13 +1110,13 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src1 first input matrix. @param src2 second input matrix. */ -GAPI_EXPORTS GMat bitwise_or(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat bitwise_or(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.bitwise_orS" @param src1 first input matrix. @param src2 scalar, which will be per-lemenetly disjuncted with elements of src1. 
*/ -GAPI_EXPORTS GMat bitwise_or(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat bitwise_or(const GMat& src1, const GScalar& src2); /** @brief computes bitwise logical "exclusive or" of the two matrixes (src1 ^ src2) @@ -1135,13 +1135,13 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src1 first input matrix. @param src2 second input matrix. */ -GAPI_EXPORTS GMat bitwise_xor(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat bitwise_xor(const GMat& src1, const GMat& src2); /** @overload @note Function textual ID is "org.opencv.core.pixelwise.bitwise_xorS" @param src1 first input matrix. @param src2 scalar, for which per-lemenet "logical or" operation on elements of src1 will be performed. */ -GAPI_EXPORTS GMat bitwise_xor(const GMat& src1, const GScalar& src2); +GAPI_EXPORTS_W GMat bitwise_xor(const GMat& src1, const GScalar& src2); /** @brief Inverts every bit of an array. @@ -1162,7 +1162,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src input matrix. */ -GAPI_EXPORTS GMat bitwise_not(const GMat& src); +GAPI_EXPORTS_W GMat bitwise_not(const GMat& src); /** @brief Select values from either first or second of input matrices by given mask. The function set to the output matrix either the value from the first input matrix if corresponding value of mask matrix is 255, @@ -1178,7 +1178,7 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix. @param mask mask input matrix. */ -GAPI_EXPORTS GMat select(const GMat& src1, const GMat& src2, const GMat& mask); +GAPI_EXPORTS_W GMat select(const GMat& src1, const GMat& src2, const GMat& mask); //! @} gapi_pixelwise @@ -1200,7 +1200,7 @@ Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @param src2 second input matrix of the same size and depth as src1. @sa max, cmpEQ, cmpLT, cmpLE */ -GAPI_EXPORTS GMat min(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat min(const GMat& src1, const GMat& src2); /** @brief Calculates per-element maximum of two matrices. @@ -1217,7 +1217,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src2 second input matrix of the same size and depth as src1. @sa min, compare, cmpEQ, cmpGT, cmpGE */ -GAPI_EXPORTS GMat max(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat max(const GMat& src1, const GMat& src2); /** @brief Calculates the per-element absolute difference between two matrices. @@ -1234,7 +1234,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src2 second input matrix. @sa abs */ -GAPI_EXPORTS GMat absDiff(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat absDiff(const GMat& src1, const GMat& src2); /** @brief Calculates absolute value of matrix elements. @@ -1251,7 +1251,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param c scalar to be subtracted. @sa min, max */ -GAPI_EXPORTS GMat absDiffC(const GMat& src, const GScalar& c); +GAPI_EXPORTS_W GMat absDiffC(const GMat& src, const GScalar& c); /** @brief Calculates sum of all matrix elements. @@ -1263,7 +1263,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src input matrix. @sa countNonZero, mean, min, max */ -GAPI_EXPORTS GScalar sum(const GMat& src); +GAPI_EXPORTS_W GScalar sum(const GMat& src); /** @brief Counts non-zero array elements. 
@@ -1276,7 +1276,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_16UC1, @ref CV_16SC1, @ref @param src input single-channel matrix. @sa mean, min, max */ -GAPI_EXPORTS GOpaque countNonZero(const GMat& src); +GAPI_EXPORTS_W GOpaque countNonZero(const GMat& src); /** @brief Calculates the weighted sum of two matrices. @@ -1299,7 +1299,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param ddepth optional depth of the output matrix. @sa add, sub */ -GAPI_EXPORTS GMat addWeighted(const GMat& src1, double alpha, const GMat& src2, double beta, double gamma, int ddepth = -1); +GAPI_EXPORTS_W GMat addWeighted(const GMat& src1, double alpha, const GMat& src2, double beta, double gamma, int ddepth = -1); /** @brief Calculates the absolute L1 norm of a matrix. @@ -1322,7 +1322,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src input matrix. @sa normL2, normInf */ -GAPI_EXPORTS GScalar normL1(const GMat& src); +GAPI_EXPORTS_W GScalar normL1(const GMat& src); /** @brief Calculates the absolute L2 norm of a matrix. @@ -1344,7 +1344,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src input matrix. @sa normL1, normInf */ -GAPI_EXPORTS GScalar normL2(const GMat& src); +GAPI_EXPORTS_W GScalar normL2(const GMat& src); /** @brief Calculates the absolute infinite norm of a matrix. @@ -1367,7 +1367,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src input matrix. @sa normL1, normL2 */ -GAPI_EXPORTS GScalar normInf(const GMat& src); +GAPI_EXPORTS_W GScalar normInf(const GMat& src); /** @brief Calculates the integral of an image. @@ -1387,7 +1387,7 @@ The function return integral image as \f$(W+1)\times (H+1)\f$ , 32-bit integer o CV_64F. @param sqdepth desired depth of the integral image of squared pixel values, CV_32F or CV_64F. */ -GAPI_EXPORTS std::tuple integral(const GMat& src, int sdepth = -1, int sqdepth = -1); +GAPI_EXPORTS_W std::tuple integral(const GMat& src, int sdepth = -1, int sqdepth = -1); /** @brief Applies a fixed-level threshold to each matrix element. @@ -1416,9 +1416,9 @@ types. @sa min, max, cmpGT, cmpLE, cmpGE, cmpLT */ -GAPI_EXPORTS GMat threshold(const GMat& src, const GScalar& thresh, const GScalar& maxval, int type); +GAPI_EXPORTS_W GMat threshold(const GMat& src, const GScalar& thresh, const GScalar& maxval, int type); /** @overload -This function applicable for all threshold types except CV_THRESH_OTSU and CV_THRESH_TRIANGLE +This function applicable for all threshold types except cv::THRESH_OTSU and cv::THRESH_TRIANGLE @note Function textual ID is "org.opencv.core.matrixop.thresholdOT" */ GAPI_EXPORTS_W std::tuple threshold(const GMat& src, const GScalar& maxval, int type); @@ -1438,7 +1438,7 @@ Input and output matrices must be CV_8UC1. @sa threshold */ -GAPI_EXPORTS GMat inRange(const GMat& src, const GScalar& threshLow, const GScalar& threshUp); +GAPI_EXPORTS_W GMat inRange(const GMat& src, const GScalar& threshLow, const GScalar& threshUp); //! @} gapi_matrixop @@ -1462,7 +1462,7 @@ The function split4 does the reverse operation. @param src4 fourth input @ref CV_8UC1 matrix to be merged. @sa merge3, split4, split3 */ -GAPI_EXPORTS GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4); +GAPI_EXPORTS_W GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4); /** @brief Creates one 3-channel matrix out of 3 single-channel ones. 
@@ -1481,7 +1481,7 @@ The function split3 does the reverse operation. @param src3 third input @ref CV_8UC1 matrix to be merged. @sa merge4, split4, split3 */ -GAPI_EXPORTS GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3); +GAPI_EXPORTS_W GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3); /** @brief Divides a 4-channel matrix into 4 single-channel matrices. @@ -1498,7 +1498,7 @@ The function merge4 does the reverse operation. @param src input @ref CV_8UC4 matrix. @sa split3, merge3, merge4 */ -GAPI_EXPORTS std::tuple split4(const GMat& src); +GAPI_EXPORTS_W std::tuple split4(const GMat& src); /** @brief Divides a 3-channel matrix into 3 single-channel matrices. @@ -1548,9 +1548,9 @@ borderMode=BORDER_TRANSPARENT, it means that the pixels in the destination image corresponds to the "outliers" in the source image are not modified by the function. @param borderValue Value used in case of a constant border. By default, it is 0. */ -GAPI_EXPORTS GMat remap(const GMat& src, const Mat& map1, const Mat& map2, - int interpolation, int borderMode = BORDER_CONSTANT, - const Scalar& borderValue = Scalar()); +GAPI_EXPORTS_W GMat remap(const GMat& src, const Mat& map1, const Mat& map2, + int interpolation, int borderMode = BORDER_CONSTANT, + const Scalar& borderValue = Scalar()); /** @brief Flips a 2D matrix around vertical, horizontal, or both axes. @@ -1587,7 +1587,7 @@ flipping around y-axis. Negative value (for example, -1) means flipping around both axes. @sa remap */ -GAPI_EXPORTS GMat flip(const GMat& src, int flipCode); +GAPI_EXPORTS_W GMat flip(const GMat& src, int flipCode); /** @brief Crops a 2D matrix. @@ -1601,7 +1601,7 @@ Output matrix must be of the same depth as input one, size is specified by given @param rect a rect to crop a matrix to @sa resize */ -GAPI_EXPORTS GMat crop(const GMat& src, const Rect& rect); +GAPI_EXPORTS_W GMat crop(const GMat& src, const Rect& rect); /** @brief Applies horizontal concatenation to given matrices. @@ -1629,7 +1629,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src2 second input matrix to be considered for horizontal concatenation. @sa concatVert */ -GAPI_EXPORTS GMat concatHor(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat concatHor(const GMat& src1, const GMat& src2); /** @overload The function horizontally concatenates given number of GMat matrices (with the same number of columns). @@ -1637,7 +1637,7 @@ Output matrix must the same number of columns and depth as the input matrices, a @param v vector of input matrices to be concatenated horizontally. */ -GAPI_EXPORTS GMat concatHor(const std::vector &v); +GAPI_EXPORTS_W GMat concatHor(const std::vector &v); /** @brief Applies vertical concatenation to given matrices. @@ -1669,7 +1669,7 @@ Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref @param src2 second input matrix to be considered for vertical concatenation. @sa concatHor */ -GAPI_EXPORTS GMat concatVert(const GMat& src1, const GMat& src2); +GAPI_EXPORTS_W GMat concatVert(const GMat& src1, const GMat& src2); /** @overload The function vertically concatenates given number of GMat matrices (with the same number of columns). @@ -1677,7 +1677,7 @@ Output matrix must the same number of columns and depth as the input matrices, a @param v vector of input matrices to be concatenated vertically. 
*/ -GAPI_EXPORTS GMat concatVert(const std::vector &v); +GAPI_EXPORTS_W GMat concatVert(const std::vector &v); /** @brief Performs a look-up table transform of a matrix. @@ -1696,7 +1696,7 @@ Output is a matrix of the same size and number of channels as src, and the same either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the input matrix. */ -GAPI_EXPORTS GMat LUT(const GMat& src, const Mat& lut); +GAPI_EXPORTS_W GMat LUT(const GMat& src, const Mat& lut); /** @brief Converts a matrix to another data depth with optional scaling. @@ -1713,7 +1713,7 @@ same as the input has; if rdepth is negative, the output matrix will have the sa @param alpha optional scale factor. @param beta optional delta added to the scaled values. */ -GAPI_EXPORTS GMat convertTo(const GMat& src, int rdepth, double alpha=1, double beta=0); +GAPI_EXPORTS_W GMat convertTo(const GMat& src, int rdepth, double alpha=1, double beta=0); /** @brief Normalizes the norm or value range of an array. @@ -1735,8 +1735,8 @@ normalization. number of channels as src and the depth =ddepth. @sa norm, Mat::convertTo */ -GAPI_EXPORTS GMat normalize(const GMat& src, double alpha, double beta, - int norm_type, int ddepth = -1); +GAPI_EXPORTS_W GMat normalize(const GMat& src, double alpha, double beta, + int norm_type, int ddepth = -1); /** @brief Applies a perspective transformation to an image. @@ -1759,8 +1759,8 @@ optional flag #WARP_INVERSE_MAP, that sets M as the inverse transformation ( @sa warpAffine, resize, remap, getRectSubPix, perspectiveTransform */ -GAPI_EXPORTS GMat warpPerspective(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR, - int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar()); +GAPI_EXPORTS_W GMat warpPerspective(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR, + int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar()); /** @brief Applies an affine transformation to an image. @@ -1784,8 +1784,8 @@ borderMode=#BORDER_TRANSPARENT isn't supported @sa warpPerspective, resize, remap, getRectSubPix, transform */ -GAPI_EXPORTS GMat warpAffine(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR, - int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar()); +GAPI_EXPORTS_W GMat warpAffine(const GMat& src, const Mat& M, const Size& dsize, int flags = cv::INTER_LINEAR, + int borderMode = cv::BORDER_CONSTANT, const Scalar& borderValue = Scalar()); //! @} gapi_transform /** @brief Finds centers of clusters and groups input samples around the clusters. @@ -1834,7 +1834,7 @@ compactness value are returned by the function. - Integer array that stores the cluster indices for every sample. - Array of the cluster centers. */ -GAPI_EXPORTS std::tuple,GMat,GMat> +GAPI_EXPORTS_W std::tuple,GMat,GMat> kmeans(const GMat& data, const int K, const GMat& bestLabels, const TermCriteria& criteria, const int attempts, const KmeansFlags flags); @@ -1857,7 +1857,7 @@ kmeans(const GArray& data, const int K, const GArray& bestLabels, /** @overload @note Function textual ID is "org.opencv.core.kmeans3D" */ -GAPI_EXPORTS std::tuple,GArray,GArray> +GAPI_EXPORTS_W std::tuple,GArray,GArray> kmeans(const GArray& data, const int K, const GArray& bestLabels, const TermCriteria& criteria, const int attempts, const KmeansFlags flags); @@ -1873,7 +1873,7 @@ The function transposes the matrix: @param src input array. 
*/ -GAPI_EXPORTS GMat transpose(const GMat& src); +GAPI_EXPORTS_W GMat transpose(const GMat& src); namespace streaming { @@ -1903,7 +1903,7 @@ GAPI_EXPORTS_W GOpaque size(const GOpaque& r); @param src Input frame @return Size (frame dimensions). */ -GAPI_EXPORTS GOpaque size(const GFrame& src); +GAPI_EXPORTS_W GOpaque size(const GFrame& src); } //namespace streaming } //namespace gapi } //namespace cv diff --git a/3rdParty/opencv2/gapi/cpu/gcpukernel.hpp b/3rdParty/opencv2/gapi/cpu/gcpukernel.hpp index e81198a7a5..86071bf7c4 100644 --- a/3rdParty/opencv2/gapi/cpu/gcpukernel.hpp +++ b/3rdParty/opencv2/gapi/cpu/gcpukernel.hpp @@ -8,9 +8,9 @@ #ifndef OPENCV_GAPI_GCPUKERNEL_HPP #define OPENCV_GAPI_GCPUKERNEL_HPP -#ifdef _MSC_VER -#pragma warning(disable: 4702) // "Unreachable code" -// on postprocess(...) call inside OCVCallHelper +#if defined _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4702) // "Unreachable code" on postprocess(...) call inside OCVCallHelper #endif #include @@ -535,4 +535,8 @@ gapi::cpu::GOCVFunctor gapi::cpu::ocv_kernel(const Callable& c) } // namespace cv +#if defined _MSC_VER +#pragma warning(pop) +#endif + #endif // OPENCV_GAPI_GCPUKERNEL_HPP diff --git a/3rdParty/opencv2/gapi/cpu/ot.hpp b/3rdParty/opencv2/gapi/cpu/ot.hpp new file mode 100644 index 0000000000..03dbe904cc --- /dev/null +++ b/3rdParty/opencv2/gapi/cpu/ot.hpp @@ -0,0 +1,29 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018 Intel Corporation + + +#ifndef OPENCV_GAPI_CPU_OT_API_HPP +#define OPENCV_GAPI_CPU_OT_API_HPP + +#include // GAPI_EXPORTS +#include // GKernelPackage + +namespace cv { +namespace gapi { +/** + * @brief This namespace contains G-API Operation Types for + * VAS Object Tracking module functionality. + */ +namespace ot { +namespace cpu { +GAPI_EXPORTS_W GKernelPackage kernels(); +} // namespace cpu +} // namespace ot +} // namespace gapi +} // namespace cv + + +#endif // OPENCV_GAPI_CPU_OT_API_HPP diff --git a/3rdParty/opencv2/gapi/fluid/gfluidkernel.hpp b/3rdParty/opencv2/gapi/fluid/gfluidkernel.hpp index cc5341575b..87d42a6fc5 100644 --- a/3rdParty/opencv2/gapi/fluid/gfluidkernel.hpp +++ b/3rdParty/opencv2/gapi/fluid/gfluidkernel.hpp @@ -248,11 +248,11 @@ struct scratch_helper const cv::GArgs &, gapi::fluid::Buffer &) { - GAPI_Assert(false); + GAPI_Error("InternalError"); } static void help_reset(gapi::fluid::Buffer &) { - GAPI_Assert(false); + GAPI_Error("InternalError"); } }; diff --git a/3rdParty/opencv2/gapi/fluid/imgproc.hpp b/3rdParty/opencv2/gapi/fluid/imgproc.hpp index 385390f1b4..31fc2446dc 100644 --- a/3rdParty/opencv2/gapi/fluid/imgproc.hpp +++ b/3rdParty/opencv2/gapi/fluid/imgproc.hpp @@ -13,7 +13,7 @@ namespace cv { namespace gapi { namespace imgproc { namespace fluid { -GAPI_EXPORTS GKernelPackage kernels(); +GAPI_EXPORTS_W GKernelPackage kernels(); }}}} diff --git a/3rdParty/opencv2/gapi/garg.hpp b/3rdParty/opencv2/gapi/garg.hpp index 063f4bc994..3c06d851f0 100644 --- a/3rdParty/opencv2/gapi/garg.hpp +++ b/3rdParty/opencv2/gapi/garg.hpp @@ -241,6 +241,7 @@ namespace gapi * * @brief G-API functions and classes for serialization and deserialization. */ + /** @brief Wraps deserialized output GRunArgs to GRunArgsP which can be used by GCompiled. 
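// Usage sketch (not part of the patch): querying frame dimensions with the newly
// wrapped GFrame overload of streaming::size. The graph protocol is illustrative.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>

static void sketch_frame_size()
{
    cv::GFrame frame;
    cv::GOpaque<cv::Size> dims = cv::gapi::streaming::size(frame);
    cv::GComputation probe(cv::GIn(frame), cv::GOut(dims));
    (void)probe;  // typically compiled with compileStreaming() and fed by a source
}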
* * Since it's impossible to get modifiable output arguments from deserialization @@ -254,6 +255,7 @@ namespace gapi * @see deserialize */ GAPI_EXPORTS cv::GRunArgsP bind(cv::GRunArgs &out_args); + /** @brief Wraps output GRunArgsP available during graph execution to GRunArgs which can be serialized. * * GRunArgsP is pointer-to-value, so to be serialized they need to be binded to real values diff --git a/3rdParty/opencv2/gapi/garray.hpp b/3rdParty/opencv2/gapi/garray.hpp index 81693715f2..f04ca521d8 100644 --- a/3rdParty/opencv2/gapi/garray.hpp +++ b/3rdParty/opencv2/gapi/garray.hpp @@ -102,17 +102,17 @@ namespace detail GAPI_Assert(m_hint != nullptr); using U = typename std::decay::type; return dynamic_cast*>(m_hint.get()) != nullptr; - }; + } template void GArrayU::specifyType(){ m_hint.reset(new TypeHint::type>); - }; + } template void GArrayU::storeKind(){ setKind(cv::detail::GOpaqueTraits::kind); - }; + } // This class represents a typed STL vector reference. // Depending on origins, this reference may be either "just a" reference to @@ -177,7 +177,7 @@ namespace detail { util::get(m_ref).clear(); } - else GAPI_Assert(false); // shouldn't be called in *EXT modes + else GAPI_Error("InternalError"); // shouldn't be called in *EXT modes } // Obtain a WRITE reference to underlying object diff --git a/3rdParty/opencv2/gapi/gcommon.hpp b/3rdParty/opencv2/gapi/gcommon.hpp index 9503e1e8e6..97bd9cb2dc 100644 --- a/3rdParty/opencv2/gapi/gcommon.hpp +++ b/3rdParty/opencv2/gapi/gcommon.hpp @@ -51,6 +51,7 @@ namespace detail CV_STRING, // std::string user G-API data CV_POINT, // cv::Point user G-API data CV_POINT2F, // cv::Point2f user G-API data + CV_POINT3F, // cv::Point3f user G-API data CV_SIZE, // cv::Size user G-API data CV_RECT, // cv::Rect user G-API data CV_SCALAR, // cv::Scalar user G-API data @@ -72,16 +73,17 @@ namespace detail template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_SCALAR; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT2F; }; + template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_POINT3F; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_MAT; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_RECT; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_MAT; }; template<> struct GOpaqueTraits { static constexpr const OpaqueKind kind = OpaqueKind::CV_DRAW_PRIM; }; using GOpaqueTraitsArrayTypes = std::tuple; + cv::Point3f, cv::Mat, cv::Rect, cv::gapi::wip::draw::Prim>; // GOpaque is not supporting cv::Mat and cv::Scalar since there are GScalar and GMat types - using GOpaqueTraitsOpaqueTypes = std::tuple; + using GOpaqueTraitsOpaqueTypes = std::tuple; } // namespace detail // This definition is here because it is reused by both public(?) and internal @@ -247,6 +249,8 @@ template struct wrap_serialize } // namespace s11n } // namespace gapi +/** @} gapi_compile_args */ + /** * @brief Ask G-API to dump compiled graph in Graphviz format under * the given file name. @@ -259,7 +263,20 @@ struct graph_dump_path { std::string m_dump_path; }; -/** @} */ + +/** + * @brief Ask G-API to use threaded executor when cv::GComputation + * is compiled via cv::GComputation::compile method. 
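// Usage sketch (not part of the patch): the intended round-trip for bind(). All
// variable names are placeholders for objects produced elsewhere in an application.
#include <utility>
#include <vector>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/s11n.hpp>

static void sketch_bind_roundtrip(cv::GCompiled& compiled,
                                  const cv::GRunArgs& results,
                                  cv::GRunArgs inputs)
{
    // Serialize previously computed outputs (e.g. to pass across a process boundary)...
    const std::vector<char> bytes = cv::gapi::serialize(results);

    // ...then deserialize on the receiving side and re-bind to writable references,
    // so a GCompiled object can fill them on the next run.
    cv::GRunArgs fresh      = cv::gapi::deserialize<cv::GRunArgs>(bytes);
    cv::GRunArgsP writable  = cv::gapi::bind(fresh);
    compiled(std::move(inputs), std::move(writable));
}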
+ * + * Specifies a number of threads that should be used by executor. + */ +struct GAPI_EXPORTS use_threaded_executor +{ + use_threaded_executor(); + explicit use_threaded_executor(const uint32_t nthreads); + + uint32_t num_threads; +}; namespace detail { @@ -267,6 +284,11 @@ namespace detail { static const char* tag() { return "gapi.graph_dump_path"; } }; + + template<> struct CompileArgTag + { + static const char* tag() { return "gapi.threaded_executor"; } + }; } } // namespace cv diff --git a/3rdParty/opencv2/gapi/gcomputation.hpp b/3rdParty/opencv2/gapi/gcomputation.hpp index a06475908d..4803c7d999 100644 --- a/3rdParty/opencv2/gapi/gcomputation.hpp +++ b/3rdParty/opencv2/gapi/gcomputation.hpp @@ -50,6 +50,7 @@ namespace s11n { * * @brief G-API classes for constructed and compiled graphs. */ + /** * @brief GComputation class represents a captured computation * graph. GComputation objects form boundaries for expression code diff --git a/3rdParty/opencv2/gapi/gkernel.hpp b/3rdParty/opencv2/gapi/gkernel.hpp index 6e418d921f..2cfd0fb877 100644 --- a/3rdParty/opencv2/gapi/gkernel.hpp +++ b/3rdParty/opencv2/gapi/gkernel.hpp @@ -51,6 +51,7 @@ struct GAPI_EXPORTS GKernel GShapes outShapes; // types (shapes) kernel's outputs GKinds inKinds; // kinds of kernel's inputs (fixme: below) GCtors outCtors; // captured constructors for template output types + GKinds outKinds; // kinds of kernel's outputs (fixme: below) }; // TODO: It's questionable if inKinds should really be here. Instead, // this information could come from meta. @@ -227,7 +228,8 @@ class GKernelTypeM(Args...)> > , &K::getOutMeta , {detail::GTypeTraits::shape...} , {detail::GTypeTraits::op_kind...} - , {detail::GObtainCtor::get()...}}); + , {detail::GObtainCtor::get()...} + , {detail::GTypeTraits::op_kind...}}); call.pass(args...); // TODO: std::forward() here? return yield(call, typename detail::MkSeq::type()); } @@ -251,7 +253,8 @@ class GKernelType > , &K::getOutMeta , {detail::GTypeTraits::shape} , {detail::GTypeTraits::op_kind...} - , {detail::GObtainCtor::get()}}); + , {detail::GObtainCtor::get()} + , {detail::GTypeTraits::op_kind}}); call.pass(args...); return detail::Yield::yield(call, 0); } @@ -414,8 +417,8 @@ namespace cv { class GAPI_EXPORTS_W_SIMPLE GKernelPackage; namespace gapi { - GAPI_EXPORTS cv::GKernelPackage combine(const cv::GKernelPackage &lhs, - const cv::GKernelPackage &rhs); + GAPI_EXPORTS_W cv::GKernelPackage combine(const cv::GKernelPackage &lhs, + const cv::GKernelPackage &rhs); /// @private class GFunctor @@ -427,7 +430,7 @@ namespace gapi { virtual ~GFunctor() = default; protected: - GFunctor(const char* id) : m_id(id) { }; + GFunctor(const char* id) : m_id(id) { } private: const char* m_id; }; @@ -513,7 +516,7 @@ namespace gapi { * * @return a number of kernels in the package */ - std::size_t size() const; + GAPI_WRAP std::size_t size() const; /** * @brief Returns vector of transformations included in the package @@ -689,7 +692,7 @@ namespace gapi { int unused[] = { 0, (pkg.include(), 0)... }; cv::util::suppress_unused_warning(unused); return pkg; - }; + } template GKernelPackage kernels(FF&... functors) @@ -698,7 +701,7 @@ namespace gapi { int unused[] = { 0, (pkg.include(functors), 0)... }; cv::util::suppress_unused_warning(unused); return pkg; - }; + } /** @} */ @@ -717,6 +720,8 @@ namespace gapi { { return combine(a, combine(b, rest...)); } + // NB(DM): Variadic-arg version in Python may require the same + // approach as used in GComputation::compile/apply. 
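// Usage sketch (not part of the patch): combining kernel packages and passing the new
// threaded-executor argument at compile time. The thread count, dump path and input
// description are illustrative; 'graph' stands for any existing cv::GComputation.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/cpu/core.hpp>
#include <opencv2/gapi/fluid/imgproc.hpp>

static cv::GCompiled sketch_compile(cv::GComputation& graph)
{
    auto pkg = cv::gapi::combine(cv::gapi::core::cpu::kernels(),
                                 cv::gapi::imgproc::fluid::kernels());
    return graph.compile(cv::GMatDesc(CV_8U, 3, cv::Size(640, 480)),
                         cv::compile_args(pkg,
                                          cv::use_threaded_executor(4),
                                          cv::graph_dump_path{"graph.dot"}));
}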
/** \addtogroup gapi_compile_args * @{ diff --git a/3rdParty/opencv2/gapi/gmat.hpp b/3rdParty/opencv2/gapi/gmat.hpp index 77bdd27b4e..27ac65e78f 100644 --- a/3rdParty/opencv2/gapi/gmat.hpp +++ b/3rdParty/opencv2/gapi/gmat.hpp @@ -48,6 +48,7 @@ struct GOrigin; * `cv::GOpaque` | T * cv::GFrame | cv::MediaFrame */ + /** * @brief GMat class represents image or tensor data in the * graph. @@ -76,6 +77,18 @@ class GAPI_EXPORTS_W_SIMPLE GMat */ GAPI_WRAP GMat(); // Empty constructor + /** + * @brief Constructs a value-initialized GMat + * + * GMat may be associated with a buffer at graph construction time. + * It is useful when some operation has a Mat input which doesn't + * change during the program execution, and is set only once. + * In this case, there's no need to declare such GMat as graph input. + * + * @param m a cv::Mat buffer to associate with this GMat object. + */ + GAPI_WRAP explicit GMat(cv::Mat m); // Value-initialization constructor + /// @private GMat(const GNode &n, std::size_t out); // Operation result constructor /// @private diff --git a/3rdParty/opencv2/gapi/gopaque.hpp b/3rdParty/opencv2/gapi/gopaque.hpp index 387f05a245..207556f0b7 100644 --- a/3rdParty/opencv2/gapi/gopaque.hpp +++ b/3rdParty/opencv2/gapi/gopaque.hpp @@ -98,18 +98,18 @@ namespace detail GAPI_Assert(m_hint != nullptr); using U = util::decay_t; return dynamic_cast*>(m_hint.get()) != nullptr; - }; + } template void GOpaqueU::specifyType(){ m_hint.reset(new TypeHint>); - }; + } template void GOpaqueU::storeKind(){ // FIXME: Add assert here on cv::Mat and cv::Scalar? setKind(cv::detail::GOpaqueTraits::kind); - }; + } // This class represents a typed object reference. // Depending on origins, this reference may be either "just a" reference to @@ -171,7 +171,7 @@ namespace detail { util::get(m_ref) = {}; } - else GAPI_Assert(false); // shouldn't be called in *EXT modes + else GAPI_Error("InternalError"); // shouldn't be called in *EXT modes } // Obtain a WRITE reference to underlying object diff --git a/3rdParty/opencv2/gapi/gscalar.hpp b/3rdParty/opencv2/gapi/gscalar.hpp index 7697b10548..03a70dfc32 100644 --- a/3rdParty/opencv2/gapi/gscalar.hpp +++ b/3rdParty/opencv2/gapi/gscalar.hpp @@ -54,12 +54,11 @@ class GAPI_EXPORTS_W_SIMPLE GScalar /** * @brief Constructs a value-initialized GScalar * - * In contrast with GMat (which can be either an explicit graph input - * or a result of some operation), GScalars may have their values - * be associated at graph construction time. It is useful when - * some operation has a GScalar input which doesn't change during - * the program execution, and is set only once. In this case, - * there is no need to declare such GScalar as a graph input. + * GScalars may have their values be associated at graph + * construction time. It is useful when some operation has a + * GScalar input which doesn't change during the program + * execution, and is set only once. In this case, there is no need + * to declare such GScalar as a graph input. * * @note The value of GScalar may be overwritten by assigning some * other GScalar to the object using `operator=` -- on the @@ -67,6 +66,7 @@ class GAPI_EXPORTS_W_SIMPLE GScalar * * @param s a cv::Scalar value to associate with this GScalar object. 
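// Usage sketch (not part of the patch): a value-initialized GMat acting as a constant
// operand that is bound once instead of being passed on every apply(). The buffer
// size/type below is illustrative and must match the real input at run time.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>

static cv::GComputation sketch_const_bias()
{
    cv::GMat in;
    cv::Mat  bias_data(480, 640, CV_8UC1, cv::Scalar(10));
    cv::GMat bias(bias_data);                   // not a graph input: bound at construction
    cv::GMat out = cv::gapi::add(in, bias);
    return cv::GComputation(cv::GIn(in), cv::GOut(out));
}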
*/ + GAPI_WRAP explicit GScalar(const cv::Scalar& s); /** diff --git a/3rdParty/opencv2/gapi/gstreaming.hpp b/3rdParty/opencv2/gapi/gstreaming.hpp index 002967f5fb..8d4e15aa99 100644 --- a/3rdParty/opencv2/gapi/gstreaming.hpp +++ b/3rdParty/opencv2/gapi/gstreaming.hpp @@ -388,7 +388,6 @@ class GAPI_EXPORTS_W_SIMPLE GStreamingCompiled /// @private std::shared_ptr m_priv; }; -/** @} */ namespace gapi { @@ -409,11 +408,10 @@ namespace streaming { struct GAPI_EXPORTS_W_SIMPLE queue_capacity { GAPI_WRAP - explicit queue_capacity(size_t cap = 1) : capacity(cap) { }; + explicit queue_capacity(size_t cap = 1) : capacity(cap) { } GAPI_PROP_RW size_t capacity; }; -/** @} */ } // namespace streaming } // namespace gapi @@ -425,6 +423,8 @@ template<> struct CompileArgTag }; } +/** @} gapi_main_classes */ + } #endif // OPENCV_GAPI_GSTREAMING_COMPILED_HPP diff --git a/3rdParty/opencv2/gapi/gtransform.hpp b/3rdParty/opencv2/gapi/gtransform.hpp index bc9e509eb1..ae904d8164 100644 --- a/3rdParty/opencv2/gapi/gtransform.hpp +++ b/3rdParty/opencv2/gapi/gtransform.hpp @@ -91,7 +91,7 @@ class GTransformImpl> : public cv::detail::TransHel { \ struct G_DESCR_HELPER_CLASS(Class) \ { \ - static constexpr const char *descr() { return Descr; }; \ + static constexpr const char *descr() { return Descr; } \ }; \ } diff --git a/3rdParty/opencv2/gapi/gtype_traits.hpp b/3rdParty/opencv2/gapi/gtype_traits.hpp index 35e99c9282..cbf5f8f417 100644 --- a/3rdParty/opencv2/gapi/gtype_traits.hpp +++ b/3rdParty/opencv2/gapi/gtype_traits.hpp @@ -141,8 +141,10 @@ namespace detail template struct GTypeOf > { using type = cv::GArray; }; template struct GTypeOf { using type = cv::GOpaque;}; template<> struct GTypeOf { using type = cv::GFrame; }; - // FIXME: This is not quite correct since IStreamSource may produce not only Mat but also Scalar - // and vector data. TODO: Extend the type dispatching on these types too. + + // FIXME: This is not quite correct since IStreamSource may + // produce not only Mat but also MediaFrame, Scalar and vector + // data. TODO: Extend the type dispatching on these types too. template<> struct GTypeOf { using type = cv::GMat;}; template using g_type_of_t = typename GTypeOf::type; @@ -229,10 +231,10 @@ template struct GObtainCtor { static HostCtor get() { return HostCtor{}; } }; template struct GObtainCtor > { - static HostCtor get() { return HostCtor{ConstructVec{&GArray::VCtor}}; }; + static HostCtor get() { return HostCtor{ConstructVec{&GArray::VCtor}}; } }; template struct GObtainCtor > { - static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque::Ctor}}; }; + static HostCtor get() { return HostCtor{ConstructOpaque{&GOpaque::Ctor}}; } }; } // namespace detail } // namespace cv diff --git a/3rdParty/opencv2/gapi/gtyped.hpp b/3rdParty/opencv2/gapi/gtyped.hpp index 655be1748d..0bd1cdf7a9 100644 --- a/3rdParty/opencv2/gapi/gtyped.hpp +++ b/3rdParty/opencv2/gapi/gtyped.hpp @@ -40,7 +40,7 @@ namespace detail //workaround for MSVC 19.0 bug template auto make_default()->decltype(T{}) {return {};} -}; // detail +} // detail /** * @brief This class is a typed wrapper over a regular GComputation. diff --git a/3rdParty/opencv2/gapi/imgproc.hpp b/3rdParty/opencv2/gapi/imgproc.hpp index 0a4a31e03d..67a34fbbc9 100644 --- a/3rdParty/opencv2/gapi/imgproc.hpp +++ b/3rdParty/opencv2/gapi/imgproc.hpp @@ -556,9 +556,9 @@ is at the kernel center. 
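// Usage sketch (not part of the patch): bounding internal queues of a streaming
// compilation with queue_capacity. The capture source and file name are placeholders
// and require OpenCV's videoio module; 'graph' is assumed to take one GMat and
// produce one GMat.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/gstreaming.hpp>
#include <opencv2/gapi/streaming/cap.hpp>

static void sketch_streaming(cv::GComputation& graph)
{
    auto sc = graph.compileStreaming(
        cv::compile_args(cv::gapi::streaming::queue_capacity(2)));
    sc.setSource<cv::gapi::wip::GCaptureSource>("video.mp4");
    sc.start();
    cv::Mat out;
    while (sc.pull(cv::gout(out))) { /* consume 'out' */ }
}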
@param borderValue border value in case of constant border type @sa boxFilter, gaussianBlur, medianBlur */ -GAPI_EXPORTS GMat sepFilter(const GMat& src, int ddepth, const Mat& kernelX, const Mat& kernelY, const Point& anchor /*FIXME: = Point(-1,-1)*/, - const Scalar& delta /*FIXME = GScalar(0)*/, int borderType = BORDER_DEFAULT, - const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat sepFilter(const GMat& src, int ddepth, const Mat& kernelX, const Mat& kernelY, const Point& anchor /*FIXME: = Point(-1,-1)*/, + const Scalar& delta /*FIXME = GScalar(0)*/, int borderType = BORDER_DEFAULT, + const Scalar& borderValue = Scalar(0)); /** @brief Convolves an image with the kernel. @@ -593,8 +593,8 @@ is at the kernel center. @param borderValue border value in case of constant border type @sa sepFilter */ -GAPI_EXPORTS GMat filter2D(const GMat& src, int ddepth, const Mat& kernel, const Point& anchor = Point(-1,-1), const Scalar& delta = Scalar(0), - int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat filter2D(const GMat& src, int ddepth, const Mat& kernel, const Point& anchor = Point(-1,-1), const Scalar& delta = Scalar(0), + int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); /** @brief Blurs an image using the box filter. @@ -627,9 +627,9 @@ is at the kernel center. @param borderValue border value in case of constant border type @sa sepFilter, gaussianBlur, medianBlur, integral */ -GAPI_EXPORTS GMat boxFilter(const GMat& src, int dtype, const Size& ksize, const Point& anchor = Point(-1,-1), - bool normalize = true, int borderType = BORDER_DEFAULT, - const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat boxFilter(const GMat& src, int dtype, const Size& ksize, const Point& anchor = Point(-1,-1), + bool normalize = true, int borderType = BORDER_DEFAULT, + const Scalar& borderValue = Scalar(0)); /** @brief Blurs an image using the normalized box filter. @@ -654,8 +654,8 @@ center. @param borderValue border value in case of constant border type @sa boxFilter, bilateralFilter, GaussianBlur, medianBlur */ -GAPI_EXPORTS GMat blur(const GMat& src, const Size& ksize, const Point& anchor = Point(-1,-1), - int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat blur(const GMat& src, const Size& ksize, const Point& anchor = Point(-1,-1), + int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); //GAPI_EXPORTS_W void blur( InputArray src, OutputArray dst, @@ -687,8 +687,8 @@ sigmaX, and sigmaY. @param borderValue border value in case of constant border type @sa sepFilter, boxFilter, medianBlur */ -GAPI_EXPORTS GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX, double sigmaY = 0, - int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX, double sigmaY = 0, + int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0)); /** @brief Blurs an image using the median filter. @@ -730,9 +730,9 @@ anchor is at the element center. 
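// Usage sketch (not part of the patch): chaining the wrapped filtering ops. The
// sharpening kernel values are illustrative.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>

static cv::GComputation sketch_filters()
{
    cv::GMat in;
    cv::GMat smoothed = cv::gapi::blur(in, cv::Size(3, 3));
    cv::Mat sharpen = (cv::Mat_<float>(3, 3) << 0, -1,  0,
                                               -1,  5, -1,
                                                0, -1,  0);
    cv::GMat out = cv::gapi::filter2D(smoothed, -1, sharpen);
    return cv::GComputation(cv::GIn(in), cv::GOut(out));
}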
@param borderValue border value in case of a constant border @sa dilate, morphologyEx */ -GAPI_EXPORTS GMat erode(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1, - int borderType = BORDER_CONSTANT, - const Scalar& borderValue = morphologyDefaultBorderValue()); +GAPI_EXPORTS_W GMat erode(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1, + int borderType = BORDER_CONSTANT, + const Scalar& borderValue = morphologyDefaultBorderValue()); /** @brief Erodes an image by using 3 by 3 rectangular structuring element. @@ -750,9 +750,9 @@ Output image must have the same type, size, and number of channels as the input @param borderValue border value in case of a constant border @sa erode, dilate3x3 */ -GAPI_EXPORTS GMat erode3x3(const GMat& src, int iterations = 1, - int borderType = BORDER_CONSTANT, - const Scalar& borderValue = morphologyDefaultBorderValue()); +GAPI_EXPORTS_W GMat erode3x3(const GMat& src, int iterations = 1, + int borderType = BORDER_CONSTANT, + const Scalar& borderValue = morphologyDefaultBorderValue()); /** @brief Dilates an image by using a specific structuring element. @@ -777,9 +777,9 @@ anchor is at the element center. @param borderValue border value in case of a constant border @sa erode, morphologyEx, getStructuringElement */ -GAPI_EXPORTS GMat dilate(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1, - int borderType = BORDER_CONSTANT, - const Scalar& borderValue = morphologyDefaultBorderValue()); +GAPI_EXPORTS_W GMat dilate(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1, + int borderType = BORDER_CONSTANT, + const Scalar& borderValue = morphologyDefaultBorderValue()); /** @brief Dilates an image by using 3 by 3 rectangular structuring element. @@ -801,9 +801,9 @@ Output image must have the same type, size, and number of channels as the input @sa dilate, erode3x3 */ -GAPI_EXPORTS GMat dilate3x3(const GMat& src, int iterations = 1, - int borderType = BORDER_CONSTANT, - const Scalar& borderValue = morphologyDefaultBorderValue()); +GAPI_EXPORTS_W GMat dilate3x3(const GMat& src, int iterations = 1, + int borderType = BORDER_CONSTANT, + const Scalar& borderValue = morphologyDefaultBorderValue()); /** @brief Performs advanced morphological transformations. @@ -831,11 +831,11 @@ the kernel center. meaning. @sa dilate, erode, getStructuringElement */ -GAPI_EXPORTS GMat morphologyEx(const GMat &src, const MorphTypes op, const Mat &kernel, - const Point &anchor = Point(-1,-1), - const int iterations = 1, - const BorderTypes borderType = BORDER_CONSTANT, - const Scalar &borderValue = morphologyDefaultBorderValue()); +GAPI_EXPORTS_W GMat morphologyEx(const GMat &src, const MorphTypes op, const Mat &kernel, + const Point &anchor = Point(-1,-1), + const int iterations = 1, + const BorderTypes borderType = BORDER_CONSTANT, + const Scalar &borderValue = morphologyDefaultBorderValue()); /** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator. @@ -883,10 +883,10 @@ applied (see cv::getDerivKernels for details). 
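// Usage sketch (not part of the patch): morphological opening with the wrapped
// morphologyEx; the structuring-element shape and size are illustrative.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>
#include <opencv2/imgproc.hpp>

static cv::GComputation sketch_opening()
{
    cv::GMat in;
    const cv::Mat se = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(5, 5));
    cv::GMat opened = cv::gapi::morphologyEx(in, cv::MORPH_OPEN, se);
    return cv::GComputation(cv::GIn(in), cv::GOut(opened));
}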
@param borderValue border value in case of constant border type @sa filter2D, gaussianBlur, cartToPolar */ -GAPI_EXPORTS GMat Sobel(const GMat& src, int ddepth, int dx, int dy, int ksize = 3, - double scale = 1, double delta = 0, - int borderType = BORDER_DEFAULT, - const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W GMat Sobel(const GMat& src, int ddepth, int dx, int dy, int ksize = 3, + double scale = 1, double delta = 0, + int borderType = BORDER_DEFAULT, + const Scalar& borderValue = Scalar(0)); /** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator. @@ -934,10 +934,10 @@ applied (see cv::getDerivKernels for details). @param borderValue border value in case of constant border type @sa filter2D, gaussianBlur, cartToPolar */ -GAPI_EXPORTS std::tuple SobelXY(const GMat& src, int ddepth, int order, int ksize = 3, - double scale = 1, double delta = 0, - int borderType = BORDER_DEFAULT, - const Scalar& borderValue = Scalar(0)); +GAPI_EXPORTS_W std::tuple SobelXY(const GMat& src, int ddepth, int order, int ksize = 3, + double scale = 1, double delta = 0, + int borderType = BORDER_DEFAULT, + const Scalar& borderValue = Scalar(0)); /** @brief Calculates the Laplacian of an image. @@ -964,8 +964,8 @@ applied. See #getDerivKernels for details. @return Destination image of the same size and the same number of channels as src. @sa Sobel, Scharr */ -GAPI_EXPORTS GMat Laplacian(const GMat& src, int ddepth, int ksize = 1, - double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT); +GAPI_EXPORTS_W GMat Laplacian(const GMat& src, int ddepth, int ksize = 1, + double scale = 1, double delta = 0, int borderType = BORDER_DEFAULT); /** @brief Applies the bilateral filter to an image. @@ -998,8 +998,8 @@ proportional to sigmaSpace. @param borderType border mode used to extrapolate pixels outside of the image, see #BorderTypes @return Destination image of the same size and type as src. */ -GAPI_EXPORTS GMat bilateralFilter(const GMat& src, int d, double sigmaColor, double sigmaSpace, - int borderType = BORDER_DEFAULT); +GAPI_EXPORTS_W GMat bilateralFilter(const GMat& src, int d, double sigmaColor, double sigmaSpace, + int borderType = BORDER_DEFAULT); //! @} gapi_filters @@ -1023,8 +1023,8 @@ largest value is used to find initial segments of strong edges. See L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough ( L2gradient=false ). */ -GAPI_EXPORTS GMat Canny(const GMat& image, double threshold1, double threshold2, - int apertureSize = 3, bool L2gradient = false); +GAPI_EXPORTS_W GMat Canny(const GMat& image, double threshold1, double threshold2, + int apertureSize = 3, bool L2gradient = false); /** @brief Determines strong corners on an image. @@ -1070,14 +1070,14 @@ or #cornerMinEigenVal. @return vector of detected corners. */ -GAPI_EXPORTS_W GArray goodFeaturesToTrack(const GMat &image, - int maxCorners, - double qualityLevel, - double minDistance, - const Mat &mask = Mat(), - int blockSize = 3, - bool useHarrisDetector = false, - double k = 0.04); +GAPI_EXPORTS_W GArray goodFeaturesToTrack(const GMat &image, + int maxCorners, + double qualityLevel, + double minDistance, + const Mat &mask = Mat(), + int blockSize = 3, + bool useHarrisDetector = false, + double k = 0.04); /** @brief Equalizes the histogram of a grayscale image. @@ -1098,7 +1098,7 @@ The algorithm normalizes the brightness and increases the contrast of the image. @param src Source 8-bit single channel image. 
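// Usage sketch (not part of the patch): the tuple-returning SobelXY together with
// Canny and goodFeaturesToTrack. Thresholds and the corner limit are illustrative.
#include <tuple>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>

static cv::GComputation sketch_gradients()
{
    cv::GMat gray;                              // CV_8UC1 input expected
    cv::GMat gx, gy;
    std::tie(gx, gy) = cv::gapi::SobelXY(gray, CV_32F, 1);
    cv::GMat edges = cv::gapi::Canny(gray, 40, 120);
    cv::GArray<cv::Point2f> corners =
        cv::gapi::goodFeaturesToTrack(gray, 100, 0.01, 10.0);
    return cv::GComputation(cv::GIn(gray), cv::GOut(gx, gy, edges, corners));
}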
*/ -GAPI_EXPORTS GMat equalizeHist(const GMat& src); +GAPI_EXPORTS_W GMat equalizeHist(const GMat& src); //! @addtogroup gapi_shape //! @{ @@ -1209,7 +1209,7 @@ Calculates the up-right bounding rectangle of a point set. @param src Input 2D point set, stored in std::vector. */ -GAPI_EXPORTS GOpaque boundingRect(const GArray& src); +GAPI_EXPORTS_W GOpaque boundingRect(const GArray& src); /** @brief Fits a line to a 2D point set. @@ -1399,7 +1399,7 @@ Resulting gray color value computed as @param bY float multiplier for B channel. @sa RGB2YUV */ -GAPI_EXPORTS GMat RGB2Gray(const GMat& src, float rY, float gY, float bY); +GAPI_EXPORTS_W GMat RGB2Gray(const GMat& src, float rY, float gY, float bY); /** @brief Converts an image from BGR color space to gray-scaled. @@ -1412,7 +1412,7 @@ Resulting gray color value computed as @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC1. @sa BGR2LUV */ -GAPI_EXPORTS GMat BGR2Gray(const GMat& src); +GAPI_EXPORTS_W GMat BGR2Gray(const GMat& src); /** @brief Converts an image from RGB color space to YUV color space. @@ -1429,7 +1429,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2RGB, RGB2Lab */ -GAPI_EXPORTS GMat RGB2YUV(const GMat& src); +GAPI_EXPORTS_W GMat RGB2YUV(const GMat& src); /** @brief Converts an image from BGR color space to I420 color space. @@ -1445,7 +1445,7 @@ Height of I420 output image must be equal 3/2 from height of input image. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa I4202BGR */ -GAPI_EXPORTS GMat BGR2I420(const GMat& src); +GAPI_EXPORTS_W GMat BGR2I420(const GMat& src); /** @brief Converts an image from RGB color space to I420 color space. @@ -1461,7 +1461,7 @@ Height of I420 output image must be equal 3/2 from height of input image. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa I4202RGB */ -GAPI_EXPORTS GMat RGB2I420(const GMat& src); +GAPI_EXPORTS_W GMat RGB2I420(const GMat& src); /** @brief Converts an image from I420 color space to BGR color space. @@ -1477,7 +1477,7 @@ Height of BGR output image must be equal 2/3 from height of input image. @param src input image: 8-bit unsigned 1-channel image @ref CV_8UC1. @sa BGR2I420 */ -GAPI_EXPORTS GMat I4202BGR(const GMat& src); +GAPI_EXPORTS_W GMat I4202BGR(const GMat& src); /** @brief Converts an image from I420 color space to BGR color space. @@ -1493,7 +1493,7 @@ Height of RGB output image must be equal 2/3 from height of input image. @param src input image: 8-bit unsigned 1-channel image @ref CV_8UC1. @sa RGB2I420 */ -GAPI_EXPORTS GMat I4202RGB(const GMat& src); +GAPI_EXPORTS_W GMat I4202RGB(const GMat& src); /** @brief Converts an image from BGR color space to LUV color space. @@ -1507,7 +1507,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa RGB2Lab, RGB2LUV */ -GAPI_EXPORTS GMat BGR2LUV(const GMat& src); +GAPI_EXPORTS_W GMat BGR2LUV(const GMat& src); /** @brief Converts an image from LUV color space to BGR color space. @@ -1521,7 +1521,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa BGR2LUV */ -GAPI_EXPORTS GMat LUV2BGR(const GMat& src); +GAPI_EXPORTS_W GMat LUV2BGR(const GMat& src); /** @brief Converts an image from YUV color space to BGR color space. 
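// Usage sketch (not part of the patch): typical use of the wrapped conversion and
// histogram-equalization ops on an 8-bit BGR input.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>

static cv::GComputation sketch_equalize()
{
    cv::GMat bgr;                               // CV_8UC3 input expected
    cv::GMat gray = cv::gapi::BGR2Gray(bgr);
    cv::GMat eq   = cv::gapi::equalizeHist(gray);
    return cv::GComputation(cv::GIn(bgr), cv::GOut(eq));
}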
@@ -1535,7 +1535,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa BGR2YUV */ -GAPI_EXPORTS GMat YUV2BGR(const GMat& src); +GAPI_EXPORTS_W GMat YUV2BGR(const GMat& src); /** @brief Converts an image from BGR color space to YUV color space. @@ -1549,7 +1549,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2BGR */ -GAPI_EXPORTS GMat BGR2YUV(const GMat& src); +GAPI_EXPORTS_W GMat BGR2YUV(const GMat& src); /** @brief Converts an image from RGB color space to Lab color space. @@ -1563,7 +1563,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC1. @param src input image: 8-bit unsigned 3-channel image @ref CV_8UC1. @sa RGB2YUV, RGB2LUV */ -GAPI_EXPORTS GMat RGB2Lab(const GMat& src); +GAPI_EXPORTS_W GMat RGB2Lab(const GMat& src); /** @brief Converts an image from YUV color space to RGB. The function converts an input image from YUV color space to RGB. @@ -1577,7 +1577,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @sa RGB2Lab, RGB2YUV */ -GAPI_EXPORTS GMat YUV2RGB(const GMat& src); +GAPI_EXPORTS_W GMat YUV2RGB(const GMat& src); /** @brief Converts an image from NV12 (YUV420p) color space to RGB. The function converts an input image from NV12 color space to RGB. @@ -1592,7 +1592,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2RGB, NV12toBGR */ -GAPI_EXPORTS GMat NV12toRGB(const GMat& src_y, const GMat& src_uv); +GAPI_EXPORTS_W GMat NV12toRGB(const GMat& src_y, const GMat& src_uv); /** @brief Converts an image from NV12 (YUV420p) color space to gray-scaled. The function converts an input image from NV12 color space to gray-scaled. @@ -1607,7 +1607,7 @@ Output image must be 8-bit unsigned 1-channel image @ref CV_8UC1. @sa YUV2RGB, NV12toBGR */ -GAPI_EXPORTS GMat NV12toGray(const GMat& src_y, const GMat& src_uv); +GAPI_EXPORTS_W GMat NV12toGray(const GMat& src_y, const GMat& src_uv); /** @brief Converts an image from NV12 (YUV420p) color space to BGR. The function converts an input image from NV12 color space to RGB. @@ -1622,7 +1622,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2BGR, NV12toRGB */ -GAPI_EXPORTS GMat NV12toBGR(const GMat& src_y, const GMat& src_uv); +GAPI_EXPORTS_W GMat NV12toBGR(const GMat& src_y, const GMat& src_uv); /** @brief Converts an image from BayerGR color space to RGB. The function converts an input image from BayerGR color space to RGB. @@ -1636,7 +1636,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2BGR, NV12toRGB */ -GAPI_EXPORTS GMat BayerGR2RGB(const GMat& src_gr); +GAPI_EXPORTS_W GMat BayerGR2RGB(const GMat& src_gr); /** @brief Converts an image from RGB color space to HSV. The function converts an input image from RGB color space to HSV. @@ -1650,7 +1650,7 @@ Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3. @sa YUV2BGR, NV12toRGB */ -GAPI_EXPORTS GMat RGB2HSV(const GMat& src); +GAPI_EXPORTS_W GMat RGB2HSV(const GMat& src); /** @brief Converts an image from RGB color space to YUV422. The function converts an input image from RGB color space to YUV422. @@ -1664,7 +1664,7 @@ Output image must be 8-bit unsigned 2-channel image @ref CV_8UC2. @sa YUV2BGR, NV12toRGB */ -GAPI_EXPORTS GMat RGB2YUV422(const GMat& src); +GAPI_EXPORTS_W GMat RGB2YUV422(const GMat& src); /** @brief Converts an image from NV12 (YUV420p) color space to RGB. 
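// Usage sketch (not part of the patch): converting separate NV12 planes to BGR. The
// plane-size comments restate the constraints given in the docs above.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/imgproc.hpp>

static cv::GComputation sketch_nv12()
{
    cv::GMat y;   // CV_8UC1, full resolution
    cv::GMat uv;  // CV_8UC2, half resolution in both dimensions
    cv::GMat bgr = cv::gapi::NV12toBGR(y, uv);
    return cv::GComputation(cv::GIn(y, uv), cv::GOut(bgr));
}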
The function converts an input image from NV12 color space to RGB. diff --git a/3rdParty/opencv2/gapi/infer.hpp b/3rdParty/opencv2/gapi/infer.hpp index 269c44dcd3..4aa2484f02 100644 --- a/3rdParty/opencv2/gapi/infer.hpp +++ b/3rdParty/opencv2/gapi/infer.hpp @@ -101,8 +101,10 @@ class GInferOutputsTyped if (it == m_priv->blobs.end()) { // FIXME: Avoid modifying GKernel auto shape = cv::detail::GTypeTraits::shape; + auto kind = cv::detail::GTypeTraits::op_kind; m_priv->call->kernel().outShapes.push_back(shape); m_priv->call->kernel().outCtors.emplace_back(cv::detail::GObtainCtor::get()); + m_priv->call->kernel().outKinds.emplace_back(kind); auto out_idx = static_cast(m_priv->blobs.size()); it = m_priv->blobs.emplace(name, cv::detail::Yield::yield(*(m_priv->call), out_idx)).first; @@ -175,6 +177,7 @@ std::shared_ptr makeCall(const std::string &tag, {}, // outShape will be filled later std::move(kinds), {}, // outCtors will be filled later + {}, // outKinds will be filled later }); call->setArgs(std::move(args)); @@ -397,7 +400,7 @@ void inline unpackBlobs(const cv::GInferInputs::Map& blobs, kinds.emplace_back(cv::detail::OpaqueKind::CV_UNKNOWN); break; default: - GAPI_Assert(false); + GAPI_Error("InternalError"); } } } @@ -629,7 +632,7 @@ infer2(const std::string& tag, kinds.emplace_back(cv::detail::OpaqueKind::CV_RECT); break; default: - GAPI_Assert(false); + GAPI_Error("InternalError"); } } diff --git a/3rdParty/opencv2/gapi/infer/bindings_onnx.hpp b/3rdParty/opencv2/gapi/infer/bindings_onnx.hpp new file mode 100644 index 0000000000..f7bb259924 --- /dev/null +++ b/3rdParty/opencv2/gapi/infer/bindings_onnx.hpp @@ -0,0 +1,74 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level +// directory of this distribution and at http://opencv.org/license.html. 
+ +#ifndef OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP +#define OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP + +#include // GKernelPackage +#include // Params +#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS +#include + +#include + +namespace cv { +namespace gapi { +namespace onnx { + +// NB: Used by python wrapper +// This class can be marked as SIMPLE, because it's implemented as pimpl +class GAPI_EXPORTS_W_SIMPLE PyParams { +public: + GAPI_WRAP + PyParams() = default; + + GAPI_WRAP + PyParams(const std::string& tag, const std::string& model_path); + + GAPI_WRAP + PyParams& cfgMeanStd(const std::string &layer_name, + const cv::Scalar &m, + const cv::Scalar &s); + GAPI_WRAP + PyParams& cfgNormalize(const std::string &layer_name, bool flag); + + GAPI_WRAP + PyParams& cfgAddExecutionProvider(ep::OpenVINO ep); + + GAPI_WRAP + PyParams& cfgAddExecutionProvider(ep::DirectML ep); + + GAPI_WRAP + PyParams& cfgAddExecutionProvider(ep::CoreML ep); + + GAPI_WRAP + PyParams& cfgAddExecutionProvider(ep::CUDA ep); + + GAPI_WRAP + PyParams& cfgAddExecutionProvider(ep::TensorRT ep); + + GAPI_WRAP + PyParams& cfgDisableMemPattern(); + + GAPI_WRAP + PyParams& cfgSessionOptions(const std::map& options); + + GAPI_WRAP + PyParams& cfgOptLevel(const int opt_level); + + GBackend backend() const; + std::string tag() const; + cv::util::any params() const; + +private: + std::shared_ptr> m_priv; +}; + +GAPI_EXPORTS_W PyParams params(const std::string& tag, const std::string& model_path); + +} // namespace onnx +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_INFER_BINDINGS_ONNX_HPP diff --git a/3rdParty/opencv2/gapi/infer/bindings_ov.hpp b/3rdParty/opencv2/gapi/infer/bindings_ov.hpp new file mode 100644 index 0000000000..08f5c83a3f --- /dev/null +++ b/3rdParty/opencv2/gapi/infer/bindings_ov.hpp @@ -0,0 +1,128 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// +// Copyright (C) 2023 Intel Corporation + +#ifndef OPENCV_GAPI_INFER_BINDINGS_OV_HPP +#define OPENCV_GAPI_INFER_BINDINGS_OV_HPP + +#include +#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS +#include // GKernelPackage +#include // Params + +#include + +namespace cv { +namespace gapi { +namespace ov { + +// NB: Used by python wrapper +// This class can be marked as SIMPLE, because it's implemented as pimpl +class GAPI_EXPORTS_W_SIMPLE PyParams { +public: + GAPI_WRAP + PyParams() = default; + + GAPI_WRAP + PyParams(const std::string &tag, + const std::string &model_path, + const std::string &bin_path, + const std::string &device); + + GAPI_WRAP + PyParams(const std::string &tag, + const std::string &blob_path, + const std::string &device); + + GAPI_WRAP + PyParams& cfgPluginConfig( + const std::map &config); + + GAPI_WRAP + PyParams& cfgInputTensorLayout(std::string tensor_layout); + + GAPI_WRAP + PyParams& cfgInputTensorLayout( + std::map layout_map); + + GAPI_WRAP + PyParams& cfgInputModelLayout(std::string tensor_layout); + + GAPI_WRAP + PyParams& cfgInputModelLayout( + std::map layout_map); + + GAPI_WRAP + PyParams& cfgOutputTensorLayout(std::string tensor_layout); + + GAPI_WRAP + PyParams& cfgOutputTensorLayout( + std::map layout_map); + + GAPI_WRAP + PyParams& cfgOutputModelLayout(std::string tensor_layout); + + GAPI_WRAP + PyParams& cfgOutputModelLayout( + std::map layout_map); + + GAPI_WRAP + PyParams& cfgOutputTensorPrecision(int precision); + + GAPI_WRAP + PyParams& cfgOutputTensorPrecision( + std::map precision_map); + + GAPI_WRAP + PyParams& cfgReshape(std::vector new_shape); + + GAPI_WRAP + PyParams& cfgReshape( + std::map> new_shape_map); + + GAPI_WRAP + PyParams& cfgNumRequests(const size_t nireq); + + GAPI_WRAP + PyParams& cfgMean(std::vector mean_values); + + GAPI_WRAP + PyParams& cfgMean( + std::map> mean_map); + + GAPI_WRAP + PyParams& cfgScale(std::vector scale_values); + + GAPI_WRAP + PyParams& cfgScale( + std::map> scale_map); + + GAPI_WRAP + PyParams& cfgResize(int interpolation); + + GAPI_WRAP + PyParams& cfgResize(std::map interpolation); + + GBackend backend() const; + std::string tag() const; + cv::util::any params() const; + +private: + std::shared_ptr> m_priv; +}; + +GAPI_EXPORTS_W PyParams params(const std::string &tag, + const std::string &model_path, + const std::string &weights, + const std::string &device); + +GAPI_EXPORTS_W PyParams params(const std::string &tag, + const std::string &bin_path, + const std::string &device); +} // namespace ov +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_INFER_BINDINGS_OV_HPP diff --git a/3rdParty/opencv2/gapi/infer/ie.hpp b/3rdParty/opencv2/gapi/infer/ie.hpp index b5f7e14255..4552d2db34 100644 --- a/3rdParty/opencv2/gapi/infer/ie.hpp +++ b/3rdParty/opencv2/gapi/infer/ie.hpp @@ -2,7 +2,7 @@ // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. // -// Copyright (C) 2019-2021 Intel Corporation +// Copyright (C) 2019-2023 Intel Corporation #ifndef OPENCV_GAPI_INFER_IE_HPP #define OPENCV_GAPI_INFER_IE_HPP @@ -52,7 +52,24 @@ enum class TraitAs: int using IEConfig = std::map; +enum InferMode {Sync, Async}; + namespace detail { + +template +using AttrMap = std::map; +// NB: This type is used to hold in/out layers +// attributes such as precision, layout, shape etc. +// +// User can provide attributes either: +// 1. cv::util::monostate - No value specified explicitly. +// 2. 
Attr - value specified explicitly that should be broadcasted to all layers. +// 3. AttrMap[str->T] - map specifies value for particular layer. +template +using LayerVariantAttr = cv::util::variant< cv::util::monostate + , AttrMap + , Attr>; + struct ParamDesc { std::string model_path; std::string weights_path; @@ -88,6 +105,24 @@ struct ParamDesc { cv::optional vpl_preproc_device; cv::optional vpl_preproc_ctx; + + InferMode mode; + + using PrecisionT = int; + using PrecisionMapT = std::unordered_map; + // NB: This parameter can contain: + // 1. cv::util::monostate - Don't specify precision, but use default from IR/Blob. + // 2. PrecisionT (CV_8U, CV_32F, ...) - Specifies precision for all output layers. + // 3. PrecisionMapT ({{"layer0", CV_32F}, {"layer1", CV_16F}} - Specifies precision for certain output layer. + // cv::util::monostate is default value that means precision wasn't specified. + using PrecisionVariantT = cv::util::variant; + + PrecisionVariantT output_precision; + LayerVariantAttr input_layout; + LayerVariantAttr output_layout; + LayerVariantAttr interpolation; }; } // namespace detail @@ -132,8 +167,13 @@ template class Params { , {} , {} , {} - , {}} { - }; + , {} + , InferMode::Async + , {} + , {} + , {} + , {} } { + } /** @overload Use this constructor to work with pre-compiled network. @@ -156,8 +196,13 @@ template class Params { , {} , {} , {} - , {}} { - }; + , {} + , InferMode::Async + , {} + , {} + , {} + , {} } { + } /** @brief Specifies sequence of network input layers names for inference. @@ -222,7 +267,7 @@ template class Params { @param cfg Map of pairs: (config parameter name, config parameter value). @return reference to this parameter structure. */ - Params& pluginConfig(const IEConfig& cfg) { + Params& pluginConfig(const IEConfig& cfg) { desc.config = cfg; return *this; } @@ -351,6 +396,121 @@ template class Params { return *this; } + /** @brief Specifies which api will be used to run inference. + + The function is used to specify mode for OpenVINO inference. + OpenVINO has two options to run inference: + 1. Asynchronous (using StartAsync: https://docs.openvino.ai/latest/classInferenceEngine_1_1InferRequest.html#doxid-class-inference-engine-1-1-infer-request-1a405293e8423d82a5b45f642a3bef0d24) + 2. Synchronous (using Infer: https://docs.openvino.ai/latest/classInferenceEngine_1_1InferRequest.html#doxid-class-inference-engine-1-1-infer-request-1a3391ce30894abde730523e9ca9371ce8) + By default asynchronous mode is used. + + @param mode Inference mode which will be used. + @return reference to this parameter structure. + */ + Params& cfgInferMode(InferMode mode) { + desc.mode = mode; + return *this; + } + + /** @brief Specifies the output precision for model. + + The function is used to set an output precision for model. + + @param precision Precision in OpenCV format (CV_8U, CV_32F, ...) + will be applied to all output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputPrecision(detail::ParamDesc::PrecisionT precision) { + desc.output_precision = precision; + return *this; + } + + /** @overload + + @param precision_map Map of pairs: name of corresponding output layer + and its precision in OpenCV format (CV_8U, CV_32F, ...) + @return reference to this parameter structure. + */ + Params& + cfgOutputPrecision(detail::ParamDesc::PrecisionMapT precision_map) { + desc.output_precision = precision_map; + return *this; + } + + /** @brief Specifies the input layout for model. + + The function is used to set an input layout for model. 
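// Usage sketch (not part of the patch): selecting synchronous inference and a
// floating-point output precision for the IE backend. The network type, tag,
// model files and device name are placeholders.
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/infer.hpp>
#include <opencv2/gapi/infer/ie.hpp>

G_API_NET(SampleNet, <cv::GMat(cv::GMat)>, "sample.net");

static cv::GCompileArgs sketch_ie_args()
{
    auto net = cv::gapi::ie::Params<SampleNet>{"model.xml", "model.bin", "CPU"}
                   .cfgInferMode(cv::gapi::ie::InferMode::Sync)
                   .cfgOutputPrecision(CV_32F);
    return cv::compile_args(cv::gapi::networks(net));
}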
+ + @param layout Layout in string representation ("NCHW", "NHWC", etc) + will be applied to all input layers. + @return reference to this parameter structure. + */ + Params& cfgInputLayout(std::string layout) { + desc.input_layout = std::move(layout); + return *this; + } + + /** @overload + + @param layout_map Map of pairs: name of corresponding input layer + and its layout in string representation ("NCHW", "NHWC", etc) + @return reference to this parameter structure. + */ + Params& + cfgInputLayout(detail::AttrMap layout_map) { + desc.input_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies the output layout for model. + + The function is used to set an output layout for model. + + @param layout Layout in string representation ("NCHW", "NHWC", etc) + will be applied to all output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputLayout(std::string layout) { + desc.output_layout = std::move(layout); + return *this; + } + + /** @overload + + @param layout_map Map of pairs: name of corresponding output layer + and its layout in string representation ("NCHW", "NHWC", etc) + @return reference to this parameter structure. + */ + Params& + cfgOutputLayout(detail::AttrMap layout_map) { + desc.output_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies resize interpolation algorithm. + * + The function is used to configure resize preprocessing for input layer. + + @param interpolation Resize interpolation algorithm. + Supported algorithms: #INTER_LINEAR, #INTER_AREA. + @return reference to this parameter structure. + */ + Params& cfgResize(int interpolation) { + desc.interpolation = interpolation; + return *this; + } + + /** @overload + + @param interpolation Map of pairs: name of corresponding input layer + and its resize algorithm. + @return reference to this parameter structure. + */ + Params& cfgResize(detail::AttrMap interpolation) { + desc.interpolation = std::move(interpolation); + return *this; + } + // BEGIN(G-API's network parametrization API) GBackend backend() const { return cv::gapi::ie::backend(); } std::string tag() const { return Net::tag(); } @@ -385,9 +545,9 @@ class Params { const std::string &device) : desc{ model, weights, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Load, true, {}, {}, {}, 1u, - {}, {}, {}, {}}, + {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} }, m_tag(tag) { - }; + } /** @overload @@ -403,9 +563,9 @@ class Params { const std::string &device) : desc{ model, {}, device, {}, {}, {}, 0u, 0u, detail::ParamDesc::Kind::Import, true, {}, {}, {}, 1u, - {}, {}, {}, {}}, + {}, {}, {}, {}, InferMode::Async, {}, {}, {}, {} }, m_tag(tag) { - }; + } /** @see ie::Params::pluginConfig. 
*/ Params& pluginConfig(const IEConfig& cfg) { @@ -476,6 +636,63 @@ class Params { return *this; } + /** @see ie::Params::cfgInferAPI */ + Params& cfgInferMode(InferMode mode) { + desc.mode = mode; + return *this; + } + + /** @see ie::Params::cfgOutputPrecision */ + Params& cfgOutputPrecision(detail::ParamDesc::PrecisionT precision) { + desc.output_precision = precision; + return *this; + } + + /** @overload */ + Params& + cfgOutputPrecision(detail::ParamDesc::PrecisionMapT precision_map) { + desc.output_precision = precision_map; + return *this; + } + + /** @see ie::Params::cfgInputLayout */ + Params& cfgInputLayout(std::string layout) { + desc.input_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgInputLayout(detail::AttrMap layout_map) { + desc.input_layout = std::move(layout_map); + return *this; + } + + /** @see ie::Params::cfgOutputLayout */ + Params& cfgOutputLayout(std::string layout) { + desc.output_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgOutputLayout(detail::AttrMap layout_map) { + desc.output_layout = std::move(layout_map); + return *this; + } + + /** @see ie::Params::cfgResize */ + Params& cfgResize(int interpolation) { + desc.interpolation = interpolation; + return *this; + } + + /** @overload */ + Params& cfgResize(detail::AttrMap interpolation) { + desc.interpolation = std::move(interpolation); + return *this; + } + // BEGIN(G-API's network parametrization API) GBackend backend() const { return cv::gapi::ie::backend(); } std::string tag() const { return m_tag; } diff --git a/3rdParty/opencv2/gapi/infer/onnx.hpp b/3rdParty/opencv2/gapi/infer/onnx.hpp index ecdf187f38..9d93246da1 100644 --- a/3rdParty/opencv2/gapi/infer/onnx.hpp +++ b/3rdParty/opencv2/gapi/infer/onnx.hpp @@ -11,12 +11,15 @@ #include #include #include // tuple, tuple_size +#include #include #include +#include #include // GAPI_EXPORTS #include // GKernelPackage +#include // Generic namespace cv { namespace gapi { @@ -26,6 +29,277 @@ namespace gapi { */ namespace onnx { +/** + * @brief This namespace contains Execution Providers structures for G-API ONNX Runtime backend. + */ +namespace ep { + +/** + * @brief This structure provides functions + * that fill inference options for ONNX CoreML Execution Provider. + * Please follow https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml-execution-provider + */ +struct GAPI_EXPORTS_W_SIMPLE CoreML { + /** @brief Class constructor. + + Constructs CoreML parameters. + + */ + GAPI_WRAP + CoreML() = default; + + /** @brief Limit CoreML Execution Provider to run on CPU only. + + This function is used to limit CoreML to run on CPU only. + Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_use_cpu_only + + @return reference to this parameter structure. + */ + GAPI_WRAP + CoreML& cfgUseCPUOnly() { + use_cpu_only = true; + return *this; + } + + /** @brief Enable CoreML EP to run on a subgraph in the body of a control flow ONNX operator (i.e. a Loop, Scan or If operator). + + This function is used to enable CoreML EP to run on + a subgraph of a control flow of ONNX operation. + Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_enable_on_subgraph + + @return reference to this parameter structure. + */ + GAPI_WRAP + CoreML& cfgEnableOnSubgraph() { + enable_on_subgraph = true; + return *this; + } + + /** @brief Enable CoreML EP to run only on Apple Neural Engine. 
+ + This function is used to enable CoreML EP to run only on Apple Neural Engine. + Please follow: https://onnxruntime.ai/docs/execution-providers/CoreML-ExecutionProvider.html#coreml_flag_only_enable_device_with_ane + + @return reference to this parameter structure. + */ + GAPI_WRAP + CoreML& cfgEnableOnlyNeuralEngine() { + enable_only_ane = true; + return *this; + } + + bool use_cpu_only = false; + bool enable_on_subgraph = false; + bool enable_only_ane = false; +}; + +/** + * @brief This structure provides functions + * that fill inference options for CUDA Execution Provider. + * Please follow https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#cuda-execution-provider + */ +struct GAPI_EXPORTS_W_SIMPLE CUDA { + // NB: Used from python. + /// @private -- Exclude this constructor from OpenCV documentation + GAPI_WRAP + CUDA() = default; + + /** @brief Class constructor. + + Constructs CUDA parameters based on device type information. + + @param dev_id Target device id to use. + */ + GAPI_WRAP + explicit CUDA(const int dev_id) + : device_id(dev_id) { + } + + int device_id; +}; + +/** + * @brief This structure provides functions + * that fill inference options for TensorRT Execution Provider. + * Please follow https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#tensorrt-execution-provider + */ +struct GAPI_EXPORTS_W_SIMPLE TensorRT { + // NB: Used from python. + /// @private -- Exclude this constructor from OpenCV documentation + GAPI_WRAP + TensorRT() = default; + + /** @brief Class constructor. + + Constructs TensorRT parameters based on device type information. + + @param dev_id Target device id to use. + */ + GAPI_WRAP + explicit TensorRT(const int dev_id) + : device_id(dev_id) { + } + + int device_id; +}; + +/** + * @brief This structure provides functions + * that fill inference options for ONNX OpenVINO Execution Provider. + * Please follow https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#summary-of-options + */ +struct GAPI_EXPORTS_W_SIMPLE OpenVINO { + // NB: Used from python. + /// @private -- Exclude this constructor from OpenCV documentation + GAPI_WRAP + OpenVINO() = default; + + /** @brief Class constructor. + + Constructs OpenVINO parameters based on device type information. + + @param dev_type Target device type to use. ("CPU", "GPU", "GPU.0" etc) + */ + GAPI_WRAP + explicit OpenVINO(const std::string &dev_type) + : device_type(dev_type) { + } + + /** @brief Class constructor. + + Constructs OpenVINO parameters based on map of options passed. + + * @param params A map of parameter names and their corresponding string values. + */ + GAPI_WRAP + explicit OpenVINO(const std::map& params) + : params_map(params) { + } + + /** @brief Specifies OpenVINO Execution Provider cache dir. + + This function is used to explicitly specify the path to save and load + the blobs enabling model caching feature. + + @param dir Path to the directory what will be used as cache. + @return reference to this parameter structure. + */ + GAPI_WRAP + OpenVINO& cfgCacheDir(const std::string &dir) { + if (!params_map.empty()) { + cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if" + "created from the parameters map.")); + } + cache_dir = dir; + return *this; + } + + /** @brief Specifies OpenVINO Execution Provider number of threads. + + This function is used to override the accelerator default value + of number of threads with this value at runtime. + + @param nthreads Number of threads. 
+ @return reference to this parameter structure. + */ + GAPI_WRAP + OpenVINO& cfgNumThreads(size_t nthreads) { + if (!params_map.empty()) { + cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if" + "created from the parameters map.")); + } + num_of_threads = nthreads; + return *this; + } + + /** @brief Enables OpenVINO Execution Provider opencl throttling. + + This function is used to enable OpenCL queue throttling for GPU devices + (reduces CPU utilization when using GPU). + + @return reference to this parameter structure. + */ + GAPI_WRAP + OpenVINO& cfgEnableOpenCLThrottling() { + if (!params_map.empty()) { + cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if" + "created from the parameters map.")); + } + enable_opencl_throttling = true; + return *this; + } + + /** @brief Enables OpenVINO Execution Provider dynamic shapes. + + This function is used to enable OpenCL queue throttling for GPU devices + (reduces CPU utilization when using GPU). + This function is used to enable work with dynamic shaped models + whose shape will be set dynamically based on the infer input + image/data shape at run time in CPU. + + @return reference to this parameter structure. + */ + GAPI_WRAP + OpenVINO& cfgEnableDynamicShapes() { + if (!params_map.empty()) { + cv::util::throw_error(std::logic_error("ep::OpenVINO cannot be changed if" + "created from the parameters map.")); + } + enable_dynamic_shapes = true; + return *this; + } + + std::string device_type; + std::string cache_dir; + size_t num_of_threads = 0; + bool enable_opencl_throttling = false; + bool enable_dynamic_shapes = false; + std::map params_map; +}; + +/** + * @brief This structure provides functions + * that fill inference options for ONNX DirectML Execution Provider. + * Please follow https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html#directml-execution-provider + */ +class GAPI_EXPORTS_W_SIMPLE DirectML { +public: + // NB: Used from python. + /// @private -- Exclude this constructor from OpenCV documentation + GAPI_WRAP + DirectML() = default; + + /** @brief Class constructor. + + Constructs DirectML parameters based on device id. + + @param device_id Target device id to use. ("0", "1", etc) + */ + GAPI_WRAP + explicit DirectML(const int device_id) : ddesc(device_id) { }; + + /** @brief Class constructor. + + Constructs DirectML parameters based on adapter name. + + @param adapter_name Target adapter_name to use. + */ + GAPI_WRAP + explicit DirectML(const std::string &adapter_name) : ddesc(adapter_name) { }; + + using DeviceDesc = cv::util::variant; + DeviceDesc ddesc; +}; + +using EP = cv::util::variant< cv::util::monostate + , OpenVINO + , DirectML + , CoreML + , CUDA + , TensorRT>; + +} // namespace ep + GAPI_EXPORTS cv::gapi::GBackend backend(); enum class TraitAs: int { @@ -67,6 +341,21 @@ struct ParamDesc { std::vector normalize; //!< Vector of bool values that enabled or disabled normalize of input data. std::vector names_to_remap; //!< Names of output layers that will be processed in PostProc function. 
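// Usage sketch (not part of the patch): building execution-provider options for the
// ONNX Runtime backend. Device name, cache directory and thread count are illustrative.
#include <opencv2/gapi/infer/onnx.hpp>

static cv::gapi::onnx::ep::OpenVINO sketch_onnx_openvino_ep()
{
    auto ov = cv::gapi::onnx::ep::OpenVINO{"GPU"}
                  .cfgCacheDir("ov_cache")
                  .cfgNumThreads(4);
    // Alternatives provided by this header: ep::CUDA{0}, ep::TensorRT{0},
    // ep::DirectML{"adapter-name"}, or ep::CoreML{}.cfgUseCPUOnly().
    return ov;
}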
+ + bool is_generic; + + // TODO: Needs to modify the rest of ParamDesc accordingly to support + // both generic and non-generic options without duplication + // (as it was done for the OV IE backend) + // These values are pushed into the respective vector<> fields above + // when the generic infer parameters are unpacked (see GONNXBackendImpl::unpackKernel) + std::unordered_map > generic_mstd; + std::unordered_map generic_norm; + + std::map session_options; + std::vector execution_providers; + bool disable_mem_pattern; + cv::util::optional opt_level; }; } // namespace detail @@ -103,7 +392,9 @@ template class Params { desc.model_path = model; desc.num_in = std::tuple_size::value; desc.num_out = std::tuple_size::value; - }; + desc.is_generic = false; + desc.disable_mem_pattern = false; + } /** @brief Specifies sequence of network input layers names for inference. @@ -267,6 +558,109 @@ template class Params { return *this; } + /** @brief Adds execution provider for runtime. + + The function is used to add ONNX Runtime OpenVINO Execution Provider options. + + @param ep OpenVINO Execution Provider options. + @see cv::gapi::onnx::ep::OpenVINO. + + @return the reference on modified object. + */ + Params& cfgAddExecutionProvider(ep::OpenVINO&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + return *this; + } + + /** @brief Adds execution provider for runtime. + + The function is used to add ONNX Runtime DirectML Execution Provider options. + + @param ep DirectML Execution Provider options. + @see cv::gapi::onnx::ep::DirectML. + + @return the reference on modified object. + */ + Params& cfgAddExecutionProvider(ep::DirectML&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + return *this; + } + + /** @brief Adds execution provider for runtime. + + The function is used to add ONNX Runtime CoreML Execution Provider options. + + @param ep CoreML Execution Provider options. + @see cv::gapi::onnx::ep::CoreML. + + @return the reference on modified object. + */ + Params& cfgAddExecutionProvider(ep::CoreML&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + return *this; + } + + /** @brief Adds execution provider for runtime. + + The function is used to add ONNX Runtime CUDA Execution Provider options. + + @param ep CUDA Execution Provider options. + @see cv::gapi::onnx::ep::CUDA. + + @return the reference on modified object. + */ + Params& cfgAddExecutionProvider(ep::CUDA&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + return *this; + } + + /** @brief Adds execution provider for runtime. + + The function is used to add ONNX Runtime TensorRT Execution Provider options. + + @param ep TensorRT Execution Provider options. + @see cv::gapi::onnx::ep::TensorRT. + + @return the reference on modified object. + */ + Params& cfgAddExecutionProvider(ep::TensorRT&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + return *this; + } + + /** @brief Disables the memory pattern optimization. + + @return the reference on modified object. + */ + Params& cfgDisableMemPattern() { + desc.disable_mem_pattern = true; + return *this; + } + + /** @brief Configures session options for ONNX Runtime. + + This function is used to set various session options for the ONNX Runtime + session by accepting a map of key-value pairs. + + @param options A map of session option to be applied to the ONNX Runtime session. + @return the reference on modified object. 
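+
+    A minimal illustrative sketch (MyNet is a hypothetical G_API_NET type, and the
+    option key below is an assumption about the configuration entries available in
+    your ONNX Runtime build):
+    @code
+    cv::gapi::onnx::Params<MyNet> params{"model.onnx"};
+    params.cfgSessionOptions({{"session.intra_op.allow_spinning", "0"}});
+    @endcode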
+ */ + Params& cfgSessionOptions(const std::map& options) { + desc.session_options.insert(options.begin(), options.end()); + return *this; + } + + /** @brief Configures optimization level for ONNX Runtime. + + @param opt_level [optimization level]: Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all). + Please see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels. + @return the reference on modified object. + */ + Params& cfgOptLevel(const int opt_level) { + desc.opt_level = cv::util::make_optional(opt_level); + return *this; + } + // BEGIN(G-API's network parametrization API) GBackend backend() const { return cv::gapi::onnx::backend(); } std::string tag() const { return Net::tag(); } @@ -277,6 +671,87 @@ template class Params { detail::ParamDesc desc; }; +/* +* @brief This structure provides functions for generic network type that +* fill inference parameters. +* @see struct Generic +*/ +template<> +class Params { +public: + /** @brief Class constructor. + + Constructs Params based on input information and sets default values for other + inference description parameters. + + @param tag string tag of the network for which these parameters are intended. + @param model_path path to model file (.onnx file). + */ + Params(const std::string& tag, const std::string& model_path) + : desc{ model_path, 0u, 0u, {}, {}, {}, {}, {}, {}, {}, {}, {}, true, {}, {}, {}, {}, false, {} }, m_tag(tag) {} + + /** @see onnx::Params::cfgMeanStdDev. */ + void cfgMeanStdDev(const std::string &layer, + const cv::Scalar &m, + const cv::Scalar &s) { + desc.generic_mstd[layer] = std::make_pair(m, s); + } + + /** @see onnx::Params::cfgNormalize. */ + void cfgNormalize(const std::string &layer, bool flag) { + desc.generic_norm[layer] = flag; + } + + /** @see onnx::Params::cfgAddExecutionProvider. */ + void cfgAddExecutionProvider(ep::OpenVINO&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + } + + /** @see onnx::Params::cfgAddExecutionProvider. */ + void cfgAddExecutionProvider(ep::DirectML&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + } + + /** @see onnx::Params::cfgAddExecutionProvider. */ + void cfgAddExecutionProvider(ep::CoreML&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + } + + /** @see onnx::Params::cfgAddExecutionProvider. */ + void cfgAddExecutionProvider(ep::CUDA&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + } + + /** @see onnx::Params::cfgAddExecutionProvider. */ + void cfgAddExecutionProvider(ep::TensorRT&& ep) { + desc.execution_providers.emplace_back(std::move(ep)); + } + + /** @see onnx::Params::cfgDisableMemPattern. */ + void cfgDisableMemPattern() { + desc.disable_mem_pattern = true; + } + + /** @see onnx::Params::cfgSessionOptions. */ + void cfgSessionOptions(const std::map& options) { + desc.session_options.insert(options.begin(), options.end()); + } + +/** @see onnx::Params::cfgOptLevel. 
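+
+     A possible usage sketch for the generic parameters (the tag and model path are placeholders):
+     @code
+     cv::gapi::onnx::Params<cv::gapi::Generic> p{"my-net-tag", "model.onnx"};
+     p.cfgOptLevel(1);  // 1 == basic graph optimizations
+     @endcode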
*/ + void cfgOptLevel(const int opt_level) { + desc.opt_level = cv::util::make_optional(opt_level); + } + + // BEGIN(G-API's network parametrization API) + GBackend backend() const { return cv::gapi::onnx::backend(); } + std::string tag() const { return m_tag; } + cv::util::any params() const { return { desc }; } + // END(G-API's network parametrization API) +protected: + detail::ParamDesc desc; + std::string m_tag; +}; + } // namespace onnx } // namespace gapi } // namespace cv diff --git a/3rdParty/opencv2/gapi/infer/ov.hpp b/3rdParty/opencv2/gapi/infer/ov.hpp new file mode 100644 index 0000000000..782792489b --- /dev/null +++ b/3rdParty/opencv2/gapi/infer/ov.hpp @@ -0,0 +1,709 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2023 Intel Corporation + +#ifndef OPENCV_GAPI_INFER_OV_HPP +#define OPENCV_GAPI_INFER_OV_HPP + +#include + +#include +#include // GAPI_EXPORTS +#include // GKernelType[M], GBackend +#include // Generic + +#include + +namespace cv { +namespace gapi { + +/** + * @brief This namespace contains G-API OpenVINO 2.0 backend functions, + * structures, and symbols. + */ +namespace ov { + +GAPI_EXPORTS cv::gapi::GBackend backend(); + +namespace detail { + +template +using AttrMap = std::map; +// NB: This type is supposed to be used to hold in/out layers +// attributes such as precision, layout, shape etc. +// +// User can provide attributes either: +// 1. cv::util::monostate - No value specified explicitly. +// 2. Attr - value specified explicitly that should be broadcasted to all layers. +// 3. AttrMap[str->T] - map specifies value for particular layer. +template +using LayerVariantAttr = cv::util::variant< cv::util::monostate + , AttrMap + , Attr>; + +struct ParamDesc { + struct Model { + + Model(const std::string &model_path_, + const std::string &bin_path_) + : model_path(model_path_), bin_path(bin_path_) { + } + + std::string model_path; + std::string bin_path; + + LayerVariantAttr input_tensor_layout; + LayerVariantAttr input_model_layout; + LayerVariantAttr output_tensor_layout; + LayerVariantAttr output_model_layout; + LayerVariantAttr output_tensor_precision; + + LayerVariantAttr> new_shapes; + + LayerVariantAttr> mean_values; + LayerVariantAttr> scale_values; + + LayerVariantAttr interpolation; + }; + + struct CompiledModel { + std::string blob_path; + }; + + using Kind = cv::util::variant; + + ParamDesc(Kind &&kind_, + const std::string &device_, + const bool is_generic_, + const size_t num_in_, + const size_t num_out_) + : kind(std::move(kind_)), device(device_), + is_generic(is_generic_), + num_in(num_in_), num_out(num_out_) { + } + + Kind kind; + + std::string device; + bool is_generic; + + std::size_t num_in; + std::size_t num_out; + + std::vector input_names; + std::vector output_names; + + using PluginConfigT = std::map; + PluginConfigT config; + + size_t nireq = 1; +}; + +// NB: Just helper to avoid code duplication. 
+static detail::ParamDesc::Model& +getModelToSetAttrOrThrow(detail::ParamDesc::Kind &kind, + const std::string &attr_name) { + if (cv::util::holds_alternative(kind)) { + cv::util::throw_error( + std::logic_error("Specifying " + attr_name + " isn't" + " possible for compiled model.")); + } + GAPI_Assert(cv::util::holds_alternative(kind)); + return cv::util::get(kind); +} + +} // namespace detail + +/** + * @brief This structure provides functions + * that fill inference parameters for "OpenVINO Toolkit" model. + */ +template struct Params { +public: + /** @brief Class constructor. + + Constructs Params based on model information and specifies default values for other + inference description parameters. Model is loaded and compiled using "OpenVINO Toolkit". + + @param model_path Path to a model. + @param bin_path Path to a data file. + For IR format (*.bin): + If path is empty, will try to read a bin file with the same name as xml. + If the bin file with the same name is not found, will load IR without weights. + For PDPD (*.pdmodel) and ONNX (*.onnx) formats bin_path isn't used. + @param device target device to use. + */ + Params(const std::string &model_path, + const std::string &bin_path, + const std::string &device) + : m_desc( detail::ParamDesc::Kind{detail::ParamDesc::Model{model_path, bin_path}} + , device + , false /* is generic */ + , std::tuple_size::value + , std::tuple_size::value) { + } + + /** @overload + Use this constructor to work with pre-compiled network. + Model is imported from a pre-compiled blob. + + @param blob_path path to the compiled model (*.blob). + @param device target device to use. + */ + Params(const std::string &blob_path, + const std::string &device) + : m_desc( detail::ParamDesc::Kind{detail::ParamDesc::CompiledModel{blob_path}} + , device + , false /* is generic */ + , std::tuple_size::value + , std::tuple_size::value) { + } + + /** @brief Specifies sequence of network input layers names for inference. + + The function is used to associate cv::gapi::infer<> inputs with the model inputs. + Number of names has to match the number of network inputs as defined in G_API_NET(). + In case a network has only single input layer, there is no need to specify name manually. + + @param layer_names std::array where N is the number of inputs + as defined in the @ref G_API_NET. Contains names of input layers. + @return reference to this parameter structure. + */ + Params& cfgInputLayers(const std::vector &layer_names) { + m_desc.input_names = layer_names; + return *this; + } + + /** @brief Specifies sequence of network output layers names for inference. + + The function is used to associate cv::gapi::infer<> outputs with the model outputs. + Number of names has to match the number of network outputs as defined in G_API_NET(). + In case a network has only single output layer, there is no need to specify name manually. + + @param layer_names std::array where N is the number of outputs + as defined in the @ref G_API_NET. Contains names of output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputLayers(const std::vector &layer_names) { + m_desc.output_names = layer_names; + return *this; + } + + /** @brief Specifies OpenVINO plugin configuration. + + The function is used to set configuration for OpenVINO plugin. Some parameters + can be different for each plugin. Please follow https://docs.openvinotoolkit.org/latest/index.html + to check information about specific plugin. 
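+
+    For example, a hypothetical sketch that enables model caching through the plugin
+    configuration could look like this (the key and value are assumptions and may
+    differ between OpenVINO versions):
+    @code
+    params.cfgPluginConfig({{"CACHE_DIR", "/tmp/ov_cache"}});
+    @endcode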
+ + @param config Map of pairs: (config parameter name, config parameter value). + @return reference to this parameter structure. + */ + Params& cfgPluginConfig(const detail::ParamDesc::PluginConfigT &config) { + m_desc.config = config; + return *this; + } + + /** @brief Specifies tensor layout for an input layer. + + The function is used to set tensor layout for an input layer. + + @param layout Tensor layout ("NCHW", "NWHC", etc) + will be applied to all input layers. + @return reference to this parameter structure. + */ + Params& cfgInputTensorLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout") + .input_tensor_layout = std::move(layout); + return *this; + } + + /** @overload + @param layout_map Map of pairs: name of corresponding input layer + and its tensor layout represented in std::string ("NCHW", "NHWC", etc) + @return reference to this parameter structure. + */ + Params& + cfgInputTensorLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout") + .input_tensor_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies model layout for an input layer. + + The function is used to set model layout for an input layer. + + @param layout Model layout ("NCHW", "NHWC", etc) + will be applied to all input layers. + @return reference to this parameter structure. + */ + Params& cfgInputModelLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout") + .input_model_layout = std::move(layout); + return *this; + } + + /** @overload + @param layout_map Map of pairs: name of corresponding input layer + and its model layout ("NCHW", "NHWC", etc) + @return reference to this parameter structure. + */ + Params& + cfgInputModelLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout") + .input_model_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies tensor layout for an output layer. + + The function is used to set tensor layout for an output layer. + + @param layout Tensor layout ("NCHW", "NWHC", etc) + will be applied to all output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputTensorLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout") + .output_tensor_layout = std::move(layout); + return *this; + } + + /** @overload + @param layout_map Map of pairs: name of corresponding output layer + and its tensor layout represented in std::string ("NCHW", "NHWC", etc) + @return reference to this parameter structure. + */ + Params& + cfgOutputTensorLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout") + .output_tensor_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies model layout for an output layer. + + The function is used to set model layout for an output layer. + + @param layout Model layout ("NCHW", "NHWC", etc) + will be applied to all output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputModelLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout") + .output_model_layout = std::move(layout); + return *this; + } + + /** @overload + @param layout_map Map of pairs: name of corresponding output layer + and its model layout ("NCHW", "NHWC", etc) + @return reference to this parameter structure. 
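+
+    A possible usage sketch (the layer names below are placeholders):
+    @code
+    params.cfgOutputModelLayout({{"prob", "NC"}, {"boxes", "NCHW"}});
+    @endcode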
+ */ + Params& + cfgOutputModelLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout") + .output_model_layout = std::move(layout_map); + return *this; + } + + /** @brief Specifies tensor precision for an output layer. + + The function is used to set tensor precision for an output layer.. + + @param precision Precision in OpenCV format (CV_8U, CV_32F, ...) + will be applied to all output layers. + @return reference to this parameter structure. + */ + Params& cfgOutputTensorPrecision(int precision) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision") + .output_tensor_precision = precision; + return *this; + } + + /** @overload + + @param precision_map Map of pairs: name of corresponding output layer + and its precision in OpenCV format (CV_8U, CV_32F, ...) + @return reference to this parameter structure. + */ + Params& + cfgOutputTensorPrecision(detail::AttrMap precision_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision") + .output_tensor_precision = std::move(precision_map); + return *this; + } + + /** @brief Specifies the new shape for input layers. + + The function is used to set new shape for input layers. + + @param new_shape New shape will be applied to all input layers. + @return reference to this parameter structure. + */ + Params& + cfgReshape(std::vector new_shape) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape") + .new_shapes = std::move(new_shape); + return *this; + } + + /** @overload + + @param new_shape_map Map of pairs: name of corresponding output layer + and its new shape. + @return reference to this parameter structure. + */ + Params& + cfgReshape(detail::AttrMap> new_shape_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape") + .new_shapes = std::move(new_shape_map); + return *this; + } + + /** @brief Specifies number of asynchronous inference requests. + + @param nireq Number of inference asynchronous requests. + @return reference to this parameter structure. + */ + Params& cfgNumRequests(const size_t nireq) { + if (nireq == 0) { + cv::util::throw_error( + std::logic_error("Number of inference requests" + " must be greater than zero.")); + } + m_desc.nireq = nireq; + return *this; + } + + /** @brief Specifies mean values for preprocessing. + * + The function is used to set mean values for input layer preprocessing. + + @param mean_values Float vector contains mean values + @return reference to this parameter structure. + */ + Params& cfgMean(std::vector mean_values) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values") + .mean_values = std::move(mean_values); + return *this; + } + + /** @overload + + @param mean_map Map of pairs: name of corresponding input layer + and its mean values. + @return reference to this parameter structure. + */ + Params& cfgMean(detail::AttrMap> mean_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values") + .mean_values = std::move(mean_map); + return *this; + } + + /** @brief Specifies scale values for preprocessing. + * + The function is used to set scale values for input layer preprocessing. + + @param scale_values Float vector contains scale values + @return reference to this parameter structure. + */ + Params& cfgScale(std::vector scale_values) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values") + .scale_values = std::move(scale_values); + return *this; + } + + /** @overload + + @param scale_map Map of pairs: name of corresponding input layer + and its mean values. 
+ @return reference to this parameter structure. + */ + Params& cfgScale(detail::AttrMap> scale_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values") + .scale_values = std::move(scale_map); + return *this; + } + + /** @brief Specifies resize interpolation algorithm. + * + The function is used to configure resize preprocessing for input layer. + + @param interpolation Resize interpolation algorithm. + Supported algorithms: #INTER_NEAREST, #INTER_LINEAR, #INTER_CUBIC. + @return reference to this parameter structure. + */ + Params& cfgResize(int interpolation) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing") + .interpolation = std::move(interpolation); + return *this; + } + + /** @overload + + @param interpolation Map of pairs: name of corresponding input layer + and its resize algorithm. + @return reference to this parameter structure. + */ + Params& cfgResize(detail::AttrMap interpolation) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing") + .interpolation = std::move(interpolation); + return *this; + } + + // BEGIN(G-API's network parametrization API) + GBackend backend() const { return cv::gapi::ov::backend(); } + std::string tag() const { return Net::tag(); } + cv::util::any params() const { return { m_desc }; } + // END(G-API's network parametrization API) + +protected: + detail::ParamDesc m_desc; +}; + +/* +* @brief This structure provides functions for generic network type that +* fill inference parameters. +* @see struct Generic +*/ +template<> +class Params { +public: + /** @brief Class constructor. + + Constructs Params based on model information and specifies default values for other + inference description parameters. Model is loaded and compiled using "OpenVINO Toolkit". + + @param tag string tag of the network for which these parameters are intended. + @param model_path Path to a model. + @param bin_path Path to a data file. + For IR format (*.bin): + If path is empty, will try to read a bin file with the same name as xml. + If the bin file with the same name is not found, will load IR without weights. + For PDPD (*.pdmodel) and ONNX (*.onnx) formats bin_path isn't used. + @param device target device to use. + */ + Params(const std::string &tag, + const std::string &model_path, + const std::string &bin_path, + const std::string &device) + : m_tag(tag), + m_desc( detail::ParamDesc::Kind{detail::ParamDesc::Model{model_path, bin_path}} + , device + , true /* is generic */ + , 0u + , 0u) { + } + + /** @overload + + This constructor for pre-compiled networks. Model is imported from pre-compiled + blob. + + @param tag string tag of the network for which these parameters are intended. + @param blob_path path to the compiled model (*.blob). + @param device target device to use. + */ + Params(const std::string &tag, + const std::string &blob_path, + const std::string &device) + : m_tag(tag), + m_desc( detail::ParamDesc::Kind{detail::ParamDesc::CompiledModel{blob_path}} + , device + , true /* is generic */ + , 0u + , 0u) { + } + + /** @see ov::Params::cfgPluginConfig. */ + Params& cfgPluginConfig(const detail::ParamDesc::PluginConfigT &config) { + m_desc.config = config; + return *this; + } + + /** @see ov::Params::cfgInputTensorLayout. 
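+
+     A brief illustrative sketch for the generic parameters (tag, paths and device are placeholders):
+     @code
+     cv::gapi::ov::Params<cv::gapi::Generic> p{"my-net", "model.xml", "model.bin", "CPU"};
+     p.cfgInputTensorLayout("NHWC");
+     @endcode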
*/ + Params& cfgInputTensorLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout") + .input_tensor_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgInputTensorLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input tensor layout") + .input_tensor_layout = std::move(layout_map); + return *this; + } + + /** @see ov::Params::cfgInputModelLayout. */ + Params& cfgInputModelLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout") + .input_model_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgInputModelLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "input model layout") + .input_model_layout = std::move(layout_map); + return *this; + } + + /** @see ov::Params::cfgOutputTensorLayout. */ + Params& cfgOutputTensorLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout") + .output_tensor_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgOutputTensorLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor layout") + .output_tensor_layout = std::move(layout_map); + return *this; + } + + /** @see ov::Params::cfgOutputModelLayout. */ + Params& cfgOutputModelLayout(std::string layout) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout") + .output_model_layout = std::move(layout); + return *this; + } + + /** @overload */ + Params& + cfgOutputModelLayout(detail::AttrMap layout_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output model layout") + .output_model_layout = std::move(layout_map); + return *this; + } + + /** @see ov::Params::cfgOutputTensorPrecision. */ + Params& cfgOutputTensorPrecision(int precision) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision") + .output_tensor_precision = precision; + return *this; + } + + /** @overload */ + Params& + cfgOutputTensorPrecision(detail::AttrMap precision_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "output tensor precision") + .output_tensor_precision = std::move(precision_map); + return *this; + } + + /** @see ov::Params::cfgReshape. */ + Params& cfgReshape(std::vector new_shape) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape") + .new_shapes = std::move(new_shape); + return *this; + } + + /** @overload */ + Params& + cfgReshape(detail::AttrMap> new_shape_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape") + .new_shapes = std::move(new_shape_map); + return *this; + } + + /** @see ov::Params::cfgNumRequests. */ + Params& cfgNumRequests(const size_t nireq) { + if (nireq == 0) { + cv::util::throw_error( + std::logic_error("Number of inference requests" + " must be greater than zero.")); + } + m_desc.nireq = nireq; + return *this; + } + + /** @see ov::Params::cfgMean. */ + Params& cfgMean(std::vector mean_values) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values") + .mean_values = std::move(mean_values); + return *this; + } + + /** @overload */ + Params& cfgMean(detail::AttrMap> mean_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "mean values") + .mean_values = std::move(mean_map); + return *this; + } + + /** @see ov::Params::cfgScale. 
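+
+     For example (the per-channel values below are purely illustrative):
+     @code
+     p.cfgScale({58.395f, 57.12f, 57.375f});
+     @endcode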
*/ + Params& cfgScale(std::vector scale_values) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values") + .scale_values = std::move(scale_values); + return *this; + } + + /** @overload */ + Params& cfgScale(detail::AttrMap> scale_map) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "scale values") + .scale_values = std::move(scale_map); + return *this; + } + + /** @see ov::Params::cfgResize. */ + Params& cfgResize(int interpolation) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing") + .interpolation = std::move(interpolation); + return *this; + } + + /** @overload */ + Params& cfgResize(detail::AttrMap interpolation) { + detail::getModelToSetAttrOrThrow(m_desc.kind, "resize preprocessing") + .interpolation = std::move(interpolation); + return *this; + } + + // BEGIN(G-API's network parametrization API) + GBackend backend() const { return cv::gapi::ov::backend(); } + std::string tag() const { return m_tag; } + cv::util::any params() const { return { m_desc }; } + // END(G-API's network parametrization API) + +protected: + std::string m_tag; + detail::ParamDesc m_desc; +}; + +} // namespace ov + +namespace wip { namespace ov { +/** + * @brief Ask G-API OpenVINO backend to run only inference of model provided. + * + * G-API OpenVINO backend will perform only the inference of the model provided + * without populating input and copying back output data. + * This mode is used to evaluate the pure inference performance of the model without + * taking into account the i/o data transfer. + */ +struct benchmark_mode { }; + +} // namespace ov +} // namespace wip + +} // namespace gapi + +namespace detail +{ + template<> struct CompileArgTag + { + static const char* tag() { return "gapi.wip.ov.benchmark_mode"; } + }; +} + +} // namespace cv + +#endif // OPENCV_GAPI_INFER_OV_HPP diff --git a/3rdParty/opencv2/gapi/media.hpp b/3rdParty/opencv2/gapi/media.hpp index 621fc60f2d..c959832ef0 100644 --- a/3rdParty/opencv2/gapi/media.hpp +++ b/3rdParty/opencv2/gapi/media.hpp @@ -33,6 +33,7 @@ namespace cv { * @brief Extra G-API data structures used to pass input/output data * to the graph for processing. */ + /** * @brief cv::MediaFrame class represents an image/media frame * obtained from an external source. @@ -242,11 +243,11 @@ class GAPI_EXPORTS MediaFrame::IAdapter { // The default implementation does nothing virtual cv::util::any blobParams() const; virtual void serialize(cv::gapi::s11n::IOStream&) { - GAPI_Assert(false && "Generic serialize method of MediaFrame::IAdapter does nothing by default. " + GAPI_Error("Generic serialize method of MediaFrame::IAdapter does nothing by default. " "Please, implement it in derived class to properly serialize the object."); } virtual void deserialize(cv::gapi::s11n::IIStream&) { - GAPI_Assert(false && "Generic deserialize method of MediaFrame::IAdapter does nothing by default. " + GAPI_Error("Generic deserialize method of MediaFrame::IAdapter does nothing by default. 
" "Please, implement it in derived class to properly deserialize the object."); } }; diff --git a/3rdParty/opencv2/gapi/ocl/goclkernel.hpp b/3rdParty/opencv2/gapi/ocl/goclkernel.hpp index 2233f474c3..5a0e04ac49 100644 --- a/3rdParty/opencv2/gapi/ocl/goclkernel.hpp +++ b/3rdParty/opencv2/gapi/ocl/goclkernel.hpp @@ -119,6 +119,10 @@ template struct ocl_get_in > { static const std::vector& get(GOCLContext &ctx, int idx) { return ctx.inArg(idx).rref(); } }; +template<> struct ocl_get_in +{ + static cv::MediaFrame get(GOCLContext &ctx, int idx) { return ctx.inArg(idx); } +}; template struct ocl_get_in > { static const U& get(GOCLContext &ctx, int idx) { return ctx.inArg(idx).rref(); } @@ -149,6 +153,10 @@ struct tracked_cv_umat{ } }; +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4702) // unreachable code +#endif template void postprocess_ocl(Outputs&... outs) { @@ -162,6 +170,9 @@ void postprocess_ocl(Outputs&... outs) int dummy[] = { 0, (validate(&outs), 0)... }; cv::util::suppress_unused_warning(dummy); } +#ifdef _MSC_VER +#pragma warning(pop) +#endif template struct ocl_get_out; template<> struct ocl_get_out diff --git a/3rdParty/opencv2/gapi/opencv_includes.hpp b/3rdParty/opencv2/gapi/opencv_includes.hpp index 1b9199d7c0..57b8c857e8 100644 --- a/3rdParty/opencv2/gapi/opencv_includes.hpp +++ b/3rdParty/opencv2/gapi/opencv_includes.hpp @@ -31,6 +31,7 @@ namespace cv { using Size = gapi::own::Size; using Point = gapi::own::Point; using Point2f = gapi::own::Point2f; + using Point3f = gapi::own::Point3f; using Scalar = gapi::own::Scalar; using Mat = gapi::own::Mat; } // namespace cv diff --git a/3rdParty/opencv2/gapi/ot.hpp b/3rdParty/opencv2/gapi/ot.hpp new file mode 100644 index 0000000000..b73d7e6ee0 --- /dev/null +++ b/3rdParty/opencv2/gapi/ot.hpp @@ -0,0 +1,194 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2023 Intel Corporation + +#ifndef OPENCV_GAPI_OT_HPP +#define OPENCV_GAPI_OT_HPP + +#include +#include +#include + +namespace cv { +namespace gapi { +/** + * @brief This namespace contains G-API Operation Types for + * VAS Object Tracking module functionality. + */ +namespace ot { + +/** + * @enum TrackingStatus + * + * Tracking status twin for vas::ot::TrackingStatus + */ +enum TrackingStatus +{ + NEW = 0, /**< The object is newly added. */ + TRACKED, /**< The object is being tracked. */ + LOST /**< The object gets lost now. The object can be tracked again + by specifying detected object manually. */ +}; + +struct GAPI_EXPORTS_W_SIMPLE ObjectTrackerParams +{ + /** + * Maximum number of trackable objects in a frame. + * Valid range: 1 <= max_num_objects. Or it can be -1 if there is no limitation + * of maximum number in X86. KMB/TBH has limitation up to 1024. + * Default value is -1 which means there is no limitation in X86. KMB/TBH is -1 means 200. + */ + GAPI_PROP_RW int32_t max_num_objects = -1; + + /** + * Input color format. Supports 0(BGR), 1(NV12), 2(BGRX) and 4(I420) + */ + GAPI_PROP_RW int32_t input_image_format = 0; + + /** + * Specifies whether tracker to use detection class for keeping id of an object. + * If it is true, new detection will be associated from previous tracking only when + * those two have same class. + * class id of an object is fixed across video frames. + * If it is false, new detection can be associated across different-class objects. 
+ * In this case, the class id of an object may change across video frames depending on the tracker input. + * It is recommended to turn this option off when it is likely that detector confuses the class of object. + * For example, when detector confuses bicycle and motorbike. Turning this option off will increase + * the tracking reliability as tracker will ignore the class label of detector. + * @n + * Default value is true. + */ + GAPI_PROP_RW bool tracking_per_class = true; + + bool operator==(const ObjectTrackerParams& other) const + { + return max_num_objects == other.max_num_objects + && input_image_format == other.input_image_format + && tracking_per_class == other.tracking_per_class; + } +}; + +using GTrackedInfo = std::tuple, cv::GArray, cv::GArray, cv::GArray>; + +G_API_OP(GTrackFromMat, , cv::GArray, float)>, "com.intel.track_from_mat") +{ + static std::tuple outMeta(cv::GMatDesc, cv::GArrayDesc, cv::GArrayDesc, float) + { + return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(), + cv::empty_array_desc(), cv::empty_array_desc()); + } +}; + +G_API_OP(GTrackFromFrame, , cv::GArray, float)>, "com.intel.track_from_frame") +{ + static std::tuple outMeta(cv::GFrameDesc, cv::GArrayDesc, cv::GArrayDesc, float) + { + return std::make_tuple(cv::empty_array_desc(), cv::empty_array_desc(), + cv::empty_array_desc(), cv::empty_array_desc()); + } +}; + +/** + * @brief Tracks objects with video frames. + * If a detected object is overlapped enough with one of tracked object, the tracked object's + * informationis updated with the input detected object. + * On the other hand, if a detected object is overlapped with none of tracked objects, + * the detected object is newly added and ObjectTracker starts to track the object. + * In zero term tracking type, ObjectTracker clears tracked objects in case that empty + * list of detected objects is passed in. + * + * @param mat Input frame. + * @param detected_rects Detected objects rectangles in the input frame. + * @param detected_class_labels Detected objects class labels in the input frame. + * @param delta Frame_delta_t Delta time between two consecutive tracking in seconds. + * The valid range is [0.005 ~ 0.5]. + * @return Tracking results of target objects. + * cv::GArray Array of rectangles for tracked objects. + * cv::GArray Array of detected objects labels. + * cv::GArray Array of tracking IDs for objects. + * Numbering sequence starts from 1. + * The value 0 means the tracking ID of this object has + * not been assigned. + * cv::GArray Array of tracking statuses for objects. + */ +GAPI_EXPORTS_W std::tuple, + cv::GArray, + cv::GArray, + cv::GArray> + track(const cv::GMat& mat, + const cv::GArray& detected_rects, + const cv::GArray& detected_class_labels, + float delta); + + +/** + @overload + * @brief Tracks objects with video frames. Overload of track(...) for frame as GFrame. + * + * @param frame Input frame. + * @param detected_rects Detected objects rectangles in the input frame. + * @param detected_class_labels Detected objects class labels in the input frame. + * @param delta Frame_delta_t Delta time between two consecutive tracking in seconds. + * The valid range is [0.005 ~ 0.5]. + * @return Tracking results of target objects. + * @return Tracking results of target objects. + * cv::GArray Array of rectangles for tracked objects. + * cv::GArray Array of detected objects labels. + * cv::GArray Array of tracking IDs for objects. + * Numbering sequence starts from 1. 
+ * The value 0 means the tracking ID of this object has + * not been assigned. + * cv::GArray Array of tracking statuses for objects. + */ +GAPI_EXPORTS_W std::tuple, + cv::GArray, + cv::GArray, + cv::GArray> + track(const cv::GFrame& frame, + const cv::GArray& detected_rects, + const cv::GArray& detected_class_labels, + float delta); +} // namespace ot +} // namespace gapi +} // namespace cv + +// FIXME: move to a separate file? +namespace cv +{ +namespace detail +{ +template<> struct CompileArgTag +{ + static const char* tag() + { + return "cv.gapi.ot.object_tracker_params"; + } +}; +} // namespace detail + +namespace gapi +{ +namespace s11n +{ +namespace detail +{ +template<> struct S11N { + static void serialize(IOStream &os, const cv::gapi::ot::ObjectTrackerParams &p) { + os << p. max_num_objects << p.input_image_format << p.tracking_per_class; + } + static cv::gapi::ot::ObjectTrackerParams deserialize(IIStream &is) { + cv::gapi::ot::ObjectTrackerParams p; + is >> p. max_num_objects >> p.input_image_format >> p.tracking_per_class; + return p; + } +}; +} // namespace detail +} // namespace s11n +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_OT_HPP diff --git a/3rdParty/opencv2/gapi/own/assert.hpp b/3rdParty/opencv2/gapi/own/assert.hpp index bf171699ca..9e8be33272 100644 --- a/3rdParty/opencv2/gapi/own/assert.hpp +++ b/3rdParty/opencv2/gapi/own/assert.hpp @@ -25,6 +25,8 @@ # define GAPI_DbgAssert(expr) GAPI_DbgAssertNoOp(expr) #endif +#define GAPI_Error(msg) CV_Error(cv::Error::StsError, msg) + #else #include #include @@ -49,6 +51,10 @@ namespace detail # define GAPI_DbgAssert(expr) GAPI_Assert(expr) #endif +#define GAPI_Error(msg) { \ + ::detail::assert_abort(msg, __LINE__, __FILE__, __func__); \ +} + #endif // GAPI_STANDALONE #endif // OPENCV_GAPI_OWN_ASSERT_HPP diff --git a/3rdParty/opencv2/gapi/own/convert.hpp b/3rdParty/opencv2/gapi/own/convert.hpp index e9997d1df0..2380a1bbed 100644 --- a/3rdParty/opencv2/gapi/own/convert.hpp +++ b/3rdParty/opencv2/gapi/own/convert.hpp @@ -31,7 +31,7 @@ namespace cv return (m.dims == 2) ? 
cv::gapi::own::Mat{m.rows, m.cols, m.type(), m.data, m.step} : cv::gapi::own::Mat{to_own(m.size), m.type(), m.data}; - }; + } namespace gapi { diff --git a/3rdParty/opencv2/gapi/own/scalar.hpp b/3rdParty/opencv2/gapi/own/scalar.hpp index abdda119ef..02da06ca7a 100644 --- a/3rdParty/opencv2/gapi/own/scalar.hpp +++ b/3rdParty/opencv2/gapi/own/scalar.hpp @@ -21,7 +21,7 @@ class GAPI_EXPORTS Scalar { public: Scalar() = default; - explicit Scalar(double v0) { val[0] = v0; }; + explicit Scalar(double v0) { val[0] = v0; } Scalar(double v0, double v1, double v2 = 0, double v3 = 0) : val{v0, v1, v2, v3} { diff --git a/3rdParty/opencv2/gapi/own/types.hpp b/3rdParty/opencv2/gapi/own/types.hpp index bfe956f836..ef01056db1 100644 --- a/3rdParty/opencv2/gapi/own/types.hpp +++ b/3rdParty/opencv2/gapi/own/types.hpp @@ -43,6 +43,17 @@ class Point2f float y = 0.f; }; +class Point3f +{ +public: + Point3f() = default; + Point3f(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {} + + float x = 0.f; + float y = 0.f; + float z = 0.f; +}; + class Rect { public: diff --git a/3rdParty/opencv2/gapi/python/python.hpp b/3rdParty/opencv2/gapi/python/python.hpp index b5659fa648..00d15f849c 100644 --- a/3rdParty/opencv2/gapi/python/python.hpp +++ b/3rdParty/opencv2/gapi/python/python.hpp @@ -31,19 +31,22 @@ struct GPythonContext const cv::GArgs &ins; const cv::GMetaArgs &in_metas; const cv::GTypesInfo &out_info; + + cv::optional m_state; }; using Impl = std::function; +using Setup = std::function; class GAPI_EXPORTS GPythonKernel { public: GPythonKernel() = default; - GPythonKernel(Impl run); + GPythonKernel(Impl run, Setup setup); - cv::GRunArgs operator()(const GPythonContext& ctx); -private: - Impl m_run; + Impl run; + Setup setup = nullptr; + bool is_stateful = false; }; class GAPI_EXPORTS GPythonFunctor : public cv::gapi::GFunctor @@ -51,7 +54,8 @@ class GAPI_EXPORTS GPythonFunctor : public cv::gapi::GFunctor public: using Meta = cv::GKernel::M; - GPythonFunctor(const char* id, const Meta &meta, const Impl& impl); + GPythonFunctor(const char* id, const Meta& meta, const Impl& impl, + const Setup& setup = nullptr); GKernelImpl impl() const override; gapi::GBackend backend() const override; diff --git a/3rdParty/opencv2/gapi/rmat.hpp b/3rdParty/opencv2/gapi/rmat.hpp index e3277b967c..12c700e455 100644 --- a/3rdParty/opencv2/gapi/rmat.hpp +++ b/3rdParty/opencv2/gapi/rmat.hpp @@ -112,11 +112,11 @@ class GAPI_EXPORTS RMat // is transferred to the device when the view is destroyed virtual View access(Access) = 0; virtual void serialize(cv::gapi::s11n::IOStream&) { - GAPI_Assert(false && "Generic serialize method of RMat::IAdapter does nothing by default. " + GAPI_Error("Generic serialize method of RMat::IAdapter does nothing by default. " "Please, implement it in derived class to properly serialize the object."); } virtual void deserialize(cv::gapi::s11n::IIStream&) { - GAPI_Assert(false && "Generic deserialize method of RMat::IAdapter does nothing by default. " + GAPI_Error("Generic deserialize method of RMat::IAdapter does nothing by default. 
" "Please, implement it in derived class to properly deserialize the object."); } }; diff --git a/3rdParty/opencv2/gapi/s11n.hpp b/3rdParty/opencv2/gapi/s11n.hpp index 7406fd3147..16c162e7f1 100644 --- a/3rdParty/opencv2/gapi/s11n.hpp +++ b/3rdParty/opencv2/gapi/s11n.hpp @@ -17,7 +17,8 @@ #include // FIXME: caused by deserialize_runarg -#if (defined _WIN32 || defined _WIN64) && defined _MSC_VER +#if defined _MSC_VER +#pragma warning(push) #pragma warning(disable: 4702) #endif @@ -229,6 +230,9 @@ GAPI_EXPORTS IIStream& operator>> (IIStream& is, cv::Point &pt); GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Point2f &pt); GAPI_EXPORTS IIStream& operator>> (IIStream& is, cv::Point2f &pt); +GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Point3f &pt); +GAPI_EXPORTS IIStream& operator>> (IIStream& is, cv::Point3f &pt); + GAPI_EXPORTS IOStream& operator<< (IOStream& os, const cv::Size &sz); GAPI_EXPORTS IIStream& operator>> (IIStream& is, cv::Size &sz); @@ -332,8 +336,8 @@ IIStream& operator>> (IIStream& is, std::vector &ts) { namespace detail { template IOStream& put_v(IOStream&, const V&, std::size_t) { - GAPI_Assert(false && "variant>>: requested index is invalid"); -}; + GAPI_Error("variant>>: requested index is invalid"); +} template IOStream& put_v(IOStream& os, const V& v, std::size_t x) { @@ -344,7 +348,7 @@ IOStream& put_v(IOStream& os, const V& v, std::size_t x) { template IIStream& get_v(IIStream&, V&, std::size_t, std::size_t) { - GAPI_Assert(false && "variant<<: requested index is invalid"); + GAPI_Error("variant<<: requested index is invalid"); } template @@ -420,7 +424,7 @@ static GRunArg exec(cv::gapi::s11n::IIStream& is) { template struct deserialize_arg_with_adapter { static GRunArg exec(cv::gapi::s11n::IIStream&) { - GAPI_Assert(false && "No suitable adapter class found during RMat/MediaFrame deserialization. " + GAPI_Error("No suitable adapter class found during RMat/MediaFrame deserialization. " "Please, make sure you've passed them in cv::gapi::deserialize() template"); return GRunArg{}; } @@ -502,4 +506,8 @@ cv::GRunArgs getRunArgsWithAdapters(const std::vector &bytes) { } // namespace gapi } // namespace cv +#if defined _MSC_VER +#pragma warning(pop) +#endif + #endif // OPENCV_GAPI_S11N_HPP diff --git a/3rdParty/opencv2/gapi/s11n/base.hpp b/3rdParty/opencv2/gapi/s11n/base.hpp index 1387839481..13e8110bfc 100644 --- a/3rdParty/opencv2/gapi/s11n/base.hpp +++ b/3rdParty/opencv2/gapi/s11n/base.hpp @@ -52,7 +52,7 @@ struct S11N: public NotImplemented { * properly overload the function to use it. */ static void serialize(IOStream &, const T &) { - GAPI_Assert(false && "No serialization routine is provided!"); + GAPI_Error("No serialization routine is provided!"); } /** * @brief This function allows user to deserialize their custom type. @@ -61,7 +61,7 @@ struct S11N: public NotImplemented { * properly overload the function to use it. 
*/ static T deserialize(IIStream &) { - GAPI_Assert(false && "No deserialization routine is provided!"); + GAPI_Error("No deserialization routine is provided!"); } }; diff --git a/3rdParty/opencv2/gapi/stereo.hpp b/3rdParty/opencv2/gapi/stereo.hpp index 5545974d52..fce9c37436 100644 --- a/3rdParty/opencv2/gapi/stereo.hpp +++ b/3rdParty/opencv2/gapi/stereo.hpp @@ -62,7 +62,7 @@ G_TYPED_KERNEL(GStereo, , "org.openc case StereoOutputFormat::DISPARITY_FIXED16_12_4: return left.withDepth(CV_16SC1); default: - GAPI_Assert(false && "Unknown output format!"); + GAPI_Error("Unknown output format!"); } } }; diff --git a/3rdParty/opencv2/gapi/streaming/cap.hpp b/3rdParty/opencv2/gapi/streaming/cap.hpp index ca5a9a24f8..6ceb395733 100644 --- a/3rdParty/opencv2/gapi/streaming/cap.hpp +++ b/3rdParty/opencv2/gapi/streaming/cap.hpp @@ -22,6 +22,7 @@ * because of this file. */ #include +#include #include #include @@ -47,8 +48,16 @@ namespace wip { class GCaptureSource: public IStreamSource { public: - explicit GCaptureSource(int id) : cap(id) { prep(); } - explicit GCaptureSource(const std::string &path) : cap(path) { prep(); } + explicit GCaptureSource(int id, const std::map &properties = {}) + : cap(id) { prep(properties); } + + explicit GCaptureSource(const std::string &path, + const std::map &properties = {}) + : cap(path) { prep(properties); } + + void set(int propid, double value) { + cap.set(propid, value); + } // TODO: Add more constructor overloads to make it // fully compatible with VideoCapture's interface. @@ -59,15 +68,19 @@ class GCaptureSource: public IStreamSource bool first_pulled = false; int64_t counter = 0; - void prep() + void prep(const std::map &properties) { + for (const auto &it : properties) { + cap.set(it.first, it.second); + } + // Prepare first frame to report its meta to engine // when needed GAPI_Assert(first.empty()); cv::Mat tmp; if (!cap.read(tmp)) { - GAPI_Assert(false && "Couldn't grab the very first frame"); + GAPI_Error("Couldn't grab the very first frame"); } // NOTE: Some decode/media VideoCapture backends continue // owning the video buffer under cv::Mat so in order to @@ -114,15 +127,19 @@ class GCaptureSource: public IStreamSource }; // NB: Overload for using from python -GAPI_EXPORTS_W cv::Ptr inline make_capture_src(const std::string& path) +GAPI_EXPORTS_W cv::Ptr +inline make_capture_src(const std::string& path, + const std::map& properties = {}) { - return make_src(path); + return make_src(path, properties); } // NB: Overload for using from python -GAPI_EXPORTS_W cv::Ptr inline make_capture_src(const int id) +GAPI_EXPORTS_W cv::Ptr +inline make_capture_src(const int id, + const std::map& properties = {}) { - return make_src(id); + return make_src(id, properties); } } // namespace wip diff --git a/3rdParty/opencv2/gapi/streaming/desync.hpp b/3rdParty/opencv2/gapi/streaming/desync.hpp index 37e0babdda..eebb9d8328 100644 --- a/3rdParty/opencv2/gapi/streaming/desync.hpp +++ b/3rdParty/opencv2/gapi/streaming/desync.hpp @@ -46,6 +46,7 @@ G desync(const G &g) { , {cv::detail::GTypeTraits::shape} // output Shape , {cv::detail::GTypeTraits::op_kind} // input data kinds , {cv::detail::GObtainCtor::get()} // output template ctors + , {cv::detail::GTypeTraits::op_kind} // output data kinds }; cv::GCall call(std::move(k)); call.pass(g); diff --git a/3rdParty/opencv2/gapi/streaming/meta.hpp b/3rdParty/opencv2/gapi/streaming/meta.hpp index 6ea20d7a6d..5cae3b4938 100644 --- a/3rdParty/opencv2/gapi/streaming/meta.hpp +++ b/3rdParty/opencv2/gapi/streaming/meta.hpp @@ -50,6 
+50,7 @@ cv::GOpaque meta(G g, const std::string &tag) { , {cv::detail::GTypeTraits::shape} // output Shape , {cv::detail::GTypeTraits::op_kind} // input data kinds , {cv::detail::GObtainCtor::get()} // output template ctors + , {cv::detail::GTypeTraits::op_kind} // output data kind }; cv::GCall call(std::move(k)); call.pass(g); diff --git a/3rdParty/opencv2/gapi/streaming/onevpl/accel_types.hpp b/3rdParty/opencv2/gapi/streaming/onevpl/accel_types.hpp index 9fa68eac73..0f850198b6 100644 --- a/3rdParty/opencv2/gapi/streaming/onevpl/accel_types.hpp +++ b/3rdParty/opencv2/gapi/streaming/onevpl/accel_types.hpp @@ -65,6 +65,9 @@ GAPI_EXPORTS Device create_dx11_device(Device::Ptr device_ptr, const std::string& device_name); GAPI_EXPORTS Context create_dx11_context(Context::Ptr ctx_ptr); +GAPI_EXPORTS Device create_vaapi_device(Device::Ptr device_ptr, + const std::string& device_name); +GAPI_EXPORTS Context create_vaapi_context(Context::Ptr ctx_ptr); } // namespace onevpl } // namespace wip } // namespace gapi diff --git a/3rdParty/opencv2/gapi/streaming/onevpl/cfg_params.hpp b/3rdParty/opencv2/gapi/streaming/onevpl/cfg_params.hpp index 9f6f452a4a..b55f88e617 100644 --- a/3rdParty/opencv2/gapi/streaming/onevpl/cfg_params.hpp +++ b/3rdParty/opencv2/gapi/streaming/onevpl/cfg_params.hpp @@ -185,6 +185,8 @@ struct GAPI_EXPORTS CfgParam { const name_t& get_name() const; const value_t& get_value() const; bool is_major() const; + std::string to_string() const; + bool operator==(const CfgParam& rhs) const; bool operator< (const CfgParam& rhs) const; bool operator!=(const CfgParam& rhs) const; diff --git a/3rdParty/opencv2/gapi/streaming/onevpl/default.hpp b/3rdParty/opencv2/gapi/streaming/onevpl/default.hpp new file mode 100644 index 0000000000..8b547e1aba --- /dev/null +++ b/3rdParty/opencv2/gapi/streaming/onevpl/default.hpp @@ -0,0 +1,29 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2022 Intel Corporation + +#ifndef OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP +#define OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP + +#include // GAPI_EXPORTS +#include +#include + +namespace cv { +namespace gapi { +namespace wip { +namespace onevpl { + +/** + * @brief Provides default device selector based on config. + */ +GAPI_EXPORTS std::shared_ptr getDefaultDeviceSelector(const std::vector& cfg_params); + +} // namespace onevpl +} // namespace wip +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_STREAMING_ONEVPL_UTILS_HPP diff --git a/3rdParty/opencv2/gapi/streaming/queue_source.hpp b/3rdParty/opencv2/gapi/streaming/queue_source.hpp new file mode 100644 index 0000000000..bd385ed16e --- /dev/null +++ b/3rdParty/opencv2/gapi/streaming/queue_source.hpp @@ -0,0 +1,67 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2023 Intel Corporation + +#ifndef OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP +#define OPENCV_GAPI_STREAMING_QUEUE_SOURCE_HPP + +#include // shared_ptr +#include // is_base_of + +#include // GRunArgs +#include // GMetaArg + all descr_of +#include // IStreamSource + +namespace cv { +namespace gapi { +namespace wip { +struct Data; // fwd-declare to avoid circular? 
header dependencies + +class GAPI_EXPORTS QueueSourceBase: public cv::gapi::wip::IStreamSource { + class Priv; + std::shared_ptr m_priv; + // FIXME: Need to understand how it works with IStreamSource's shared_from_this + // Can we avoid having too many shared_ptrs here? + +public: + explicit QueueSourceBase(const cv::GMetaArg &m); + void push(Data &&data); + virtual bool pull(Data &data) override; + virtual void halt() override; + virtual GMetaArg descr_of() const override; + virtual ~QueueSourceBase() = default; +}; + +/** + * @brief Queued streaming pipeline source. + * + */ +template +class QueueSource final: public QueueSourceBase +{ +public: + using Meta = decltype(cv::descr_of(T{})); + explicit QueueSource(Meta m) : QueueSourceBase(GMetaArg{m}) { + } + void push(T t) { + QueueSourceBase::push(Data{t}); + } +}; + +class GAPI_EXPORTS QueueInput { + std::vector > m_sources; + +public: + explicit QueueInput(const cv::GMetaArgs &args); + + void push(cv::GRunArgs &&ins); + operator cv::GRunArgs(); +}; + +} // namespace wip +} // namespace gapi +} // namespace cv + +#endif // OPENCV_GAPI_STREAMING_SOURCE_HPP diff --git a/3rdParty/opencv2/gapi/streaming/source.hpp b/3rdParty/opencv2/gapi/streaming/source.hpp index 57869d76f1..9b02f03ffc 100644 --- a/3rdParty/opencv2/gapi/streaming/source.hpp +++ b/3rdParty/opencv2/gapi/streaming/source.hpp @@ -16,7 +16,7 @@ namespace cv { namespace gapi { namespace wip { - struct Data; // "forward-declaration" of GRunArg +struct Data; // forward-declaration of Data to avoid circular dependencies /** * @brief Abstract streaming pipeline source. @@ -43,6 +43,11 @@ class IStreamSource: public std::enable_shared_from_this Ptr ptr() { return shared_from_this(); } virtual bool pull(Data &data) = 0; virtual GMetaArg descr_of() const = 0; + virtual void halt() { + // Do nothing by default to maintain compatibility with the existing sources... + // In fact needs to be decorated atop of the child classes to maintain the behavior + // FIXME: Make it mandatory in OpenCV 5.0 + }; virtual ~IStreamSource() = default; }; diff --git a/3rdParty/opencv2/gapi/util/variant.hpp b/3rdParty/opencv2/gapi/util/variant.hpp index 45d48c7ab7..8b68e79ec5 100644 --- a/3rdParty/opencv2/gapi/util/variant.hpp +++ b/3rdParty/opencv2/gapi/util/variant.hpp @@ -509,6 +509,11 @@ namespace util return v.index() == util::variant::template index_of(); } +#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + template bool operator==(const variant &lhs, const variant &rhs) { @@ -524,6 +529,10 @@ namespace util return (eqs[lhs.index()])(lhs.memory, rhs.memory); } +#if defined(__GNUC__) && (__GNUC__ == 11 || __GNUC__ == 12) +#pragma GCC diagnostic pop +#endif + template bool operator!=(const variant &lhs, const variant &rhs) { diff --git a/3rdParty/opencv2/highgui.hpp b/3rdParty/opencv2/highgui.hpp index 0154df4ade..81437f4ef1 100644 --- a/3rdParty/opencv2/highgui.hpp +++ b/3rdParty/opencv2/highgui.hpp @@ -85,50 +85,8 @@ It provides easy interface to: created. Then, a new button is attached to it. 
See below the example used to generate the figure: - @code - int main(int argc, char *argv[]) - { - - int value = 50; - int value2 = 0; - - - namedWindow("main1",WINDOW_NORMAL); - namedWindow("main2",WINDOW_AUTOSIZE | WINDOW_GUI_NORMAL); - createTrackbar( "track1", "main1", &value, 255, NULL); - - String nameb1 = "button1"; - String nameb2 = "button2"; - - createButton(nameb1,callbackButton,&nameb1,QT_CHECKBOX,1); - createButton(nameb2,callbackButton,NULL,QT_CHECKBOX,0); - createTrackbar( "track2", NULL, &value2, 255, NULL); - createButton("button5",callbackButton1,NULL,QT_RADIOBOX,0); - createButton("button6",callbackButton2,NULL,QT_RADIOBOX,1); - - setMouseCallback( "main2",on_mouse,NULL ); - - Mat img1 = imread("files/flower.jpg"); - VideoCapture video; - video.open("files/hockey.avi"); - - Mat img2,img3; - - while( waitKey(33) != 27 ) - { - img1.convertTo(img2,-1,1,value); - video >> img3; - - imshow("main1",img2); - imshow("main2",img3); - } - - destroyAllWindows(); - - return 0; - } - @endcode + @include highgui_qt.cpp @defgroup highgui_winrt WinRT support @@ -139,36 +97,34 @@ It provides easy interface to: See below the example used to generate the figure: @code - void sample_app::MainPage::ShowWindow() + void sample_app::MainPage::ShowWindow() + { + static cv::String windowName("sample"); + cv::winrt_initContainer(this->cvContainer); + cv::namedWindow(windowName); // not required + + cv::Mat image = cv::imread("Assets/sample.jpg"); + cv::Mat converted = cv::Mat(image.rows, image.cols, CV_8UC4); + cv::cvtColor(image, converted, COLOR_BGR2BGRA); + cv::imshow(windowName, converted); // this will create window if it hasn't been created before + + int state = 42; + cv::TrackbarCallback callback = [](int pos, void* userdata) { - static cv::String windowName("sample"); - cv::winrt_initContainer(this->cvContainer); - cv::namedWindow(windowName); // not required - - cv::Mat image = cv::imread("Assets/sample.jpg"); - cv::Mat converted = cv::Mat(image.rows, image.cols, CV_8UC4); - cv::cvtColor(image, converted, COLOR_BGR2BGRA); - cv::imshow(windowName, converted); // this will create window if it hasn't been created before - - int state = 42; - cv::TrackbarCallback callback = [](int pos, void* userdata) - { - if (pos == 0) { - cv::destroyWindow(windowName); - } - }; - cv::TrackbarCallback callbackTwin = [](int pos, void* userdata) - { - if (pos >= 70) { - cv::destroyAllWindows(); - } - }; - cv::createTrackbar("Sample trackbar", windowName, &state, 100, callback); - cv::createTrackbar("Twin brother", windowName, &state, 100, callbackTwin); - } + if (pos == 0) { + cv::destroyWindow(windowName); + } + }; + cv::TrackbarCallback callbackTwin = [](int pos, void* userdata) + { + if (pos >= 70) { + cv::destroyAllWindows(); + } + }; + cv::createTrackbar("Sample trackbar", windowName, &state, 100, callback); + cv::createTrackbar("Twin brother", windowName, &state, 100, callbackTwin); + } @endcode - - @defgroup highgui_c C API @} */ @@ -300,9 +256,7 @@ You can call cv::destroyWindow or cv::destroyAllWindows to close the window and memory usage. For a simple program, you do not really have to call these functions because all the resources and windows of the application are closed automatically by the operating system upon exit. 
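+
+A short illustrative sketch of the typical window lifecycle (the window name and the
+`image` variable are placeholders):
+@code
+cv::namedWindow("preview", cv::WINDOW_NORMAL);
+cv::imshow("preview", image);
+cv::waitKey(0);               // wait for a key press
+cv::destroyWindow("preview"); // optional for short-lived programs, see the note above
+@endcode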
-@note - -Qt backend supports additional flags: +@note Qt backend supports additional flags: - **WINDOW_NORMAL or WINDOW_AUTOSIZE:** WINDOW_NORMAL enables you to resize the window, whereas WINDOW_AUTOSIZE adjusts automatically the window size to fit the displayed image (see imshow ), and you cannot change the window size manually. @@ -331,13 +285,20 @@ The function destroyAllWindows destroys all of the opened HighGUI windows. */ CV_EXPORTS_W void destroyAllWindows(); + +/** @brief HighGUI backend used. + +The function returns HighGUI backend name used: could be COCOA, GTK2/3, QT, WAYLAND or WIN32. +Returns empty string if there is no available UI backend. + */ +CV_EXPORTS_W const std::string currentUIFramework(); + + CV_EXPORTS_W int startWindowThread(); /** @brief Similar to #waitKey, but returns full key code. -@note - -Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc +@note Key code is implementation specific and depends on used backend: QT/GTK/Win32/etc */ CV_EXPORTS_W int waitKeyEx(int delay = 0); @@ -404,11 +365,12 @@ For example, **waitKey(0)** will display the window infinitely until any keypres for image display). **waitKey(25)** will display a frame and wait approximately 25 ms for a key press (suitable for displaying a video frame-by-frame). To remove the window, use cv::destroyWindow. -@note - -[__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. - -[__Windows Backend Only__] Pressing Ctrl+S will show a dialog to save the image. +@note [__Windows Backend Only__] Pressing Ctrl+C will copy the image to the clipboard. Pressing Ctrl+S will show a dialog to save the image. +@note [__Wayland Backend Only__] Supoorting format is extended. +- If the image is 8-bit signed, the pixels are biased by 128. That is, the + value range [-128,127] is mapped to [0,255]. +- If the image is 16-bit signed, the pixels are divided by 256 and biased by 128. That is, the + value range [-32768,32767] is mapped to [0,255]. @param winname Name of the window. @param mat Image to be shown. @@ -417,10 +379,8 @@ CV_EXPORTS_W void imshow(const String& winname, InputArray mat); /** @brief Resizes the window to the specified size -@note - -- The specified window size is for the image area. Toolbars are not counted. -- Only windows created without cv::WINDOW_AUTOSIZE flag can be resized. +@note The specified window size is for the image area. Toolbars are not counted. +Only windows created without cv::WINDOW_AUTOSIZE flag can be resized. @param winname Window name. @param width The new window width. @@ -439,6 +399,8 @@ CV_EXPORTS_W void resizeWindow(const String& winname, const cv::Size& size); @param winname Name of the window. @param x The new x-coordinate of the window. @param y The new y-coordinate of the window. + +@note [__Wayland Backend Only__] This function is not supported by the Wayland protocol limitation. */ CV_EXPORTS_W void moveWindow(const String& winname, int x, int y); @@ -449,6 +411,8 @@ The function setWindowProperty enables changing properties of a window. @param winname Name of the window. @param prop_id Window property to edit. The supported operation flags are: (cv::WindowPropertyFlags) @param prop_value New value of the window property. The supported flags are: (cv::WindowFlags) + +@note [__Wayland Backend Only__] This function is not supported. */ CV_EXPORTS_W void setWindowProperty(const String& winname, int prop_id, double prop_value); @@ -466,6 +430,8 @@ The function getWindowProperty returns properties of a window. 
@param prop_id Window property to retrieve. The following operation flags are available: (cv::WindowPropertyFlags) @sa setWindowProperty + +@note [__Wayland Backend Only__] This function is not supported. */ CV_EXPORTS_W double getWindowProperty(const String& winname, int prop_id); @@ -476,6 +442,8 @@ The function getWindowImageRect returns the client screen coordinates, width and @param winname Name of the window. @sa resizeWindow moveWindow + +@note [__Wayland Backend Only__] This function is not supported by the Wayland protocol limitation. */ CV_EXPORTS_W Rect getWindowImageRect(const String& winname); @@ -502,9 +470,7 @@ For cv::EVENT_MOUSEWHEEL positive and negative values mean forward and backward respectively. For cv::EVENT_MOUSEHWHEEL, where available, positive and negative values mean right and left scrolling, respectively. -@note - -Mouse-wheel events are currently supported only on Windows. +@note Mouse-wheel events are currently supported only on Windows and Cocoa. @param flags The mouse callback flags parameter. */ @@ -520,16 +486,17 @@ Controls: use `space` or `enter` to finish selection, use key `c` to cancel sele @param showCrosshair if true crosshair of selection rectangle will be shown. @param fromCenter if true center of selection will match initial mouse position. In opposite case a corner of selection rectangle will correspont to the initial mouse position. +@param printNotice if true a notice to select ROI or cancel selection will be printed in console. @return selected ROI or empty rect if selection canceled. @note The function sets it's own mouse callback for specified window using cv::setMouseCallback(windowName, ...). After finish of work an empty callback will be set for the used window. */ -CV_EXPORTS_W Rect selectROI(const String& windowName, InputArray img, bool showCrosshair = true, bool fromCenter = false); +CV_EXPORTS_W Rect selectROI(const String& windowName, InputArray img, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true); /** @overload */ -CV_EXPORTS_W Rect selectROI(InputArray img, bool showCrosshair = true, bool fromCenter = false); +CV_EXPORTS_W Rect selectROI(InputArray img, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true); /** @brief Allows users to select multiple ROIs on the given image. @@ -543,12 +510,13 @@ use `esc` to terminate multiple ROI selection process. @param showCrosshair if true crosshair of selection rectangle will be shown. @param fromCenter if true center of selection will match initial mouse position. In opposite case a corner of selection rectangle will correspont to the initial mouse position. +@param printNotice if true a notice to select ROI or cancel selection will be printed in console. @note The function sets it's own mouse callback for specified window using cv::setMouseCallback(windowName, ...). After finish of work an empty callback will be set for the used window. */ CV_EXPORTS_W void selectROIs(const String& windowName, InputArray img, - CV_OUT std::vector& boundingBoxes, bool showCrosshair = true, bool fromCenter = false); + CV_OUT std::vector& boundingBoxes, bool showCrosshair = true, bool fromCenter = false, bool printNotice = true); /** @brief Creates a trackbar and attaches it to the specified window. @@ -557,24 +525,26 @@ and range, assigns a variable value to be a position synchronized with the track the callback function onChange to be called on the trackbar position change. The created trackbar is displayed in the specified window winname. 
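As a quick illustration of the description above (and of the nullptr value pointer discussed in the parameter notes that follow), a minimal sketch; the window and trackbar names are illustrative:

@code
#include <opencv2/highgui.hpp>

static void onBrightness(int pos, void* userdata)
{
    // With a nullptr value pointer, the position is tracked here via userdata.
    *static_cast<int*>(userdata) = pos;
}

int main()
{
    int brightness = 50;
    cv::namedWindow("controls");
    cv::createTrackbar("brightness", "controls", nullptr, 255, onBrightness, &brightness);
    cv::setTrackbarPos("brightness", "controls", brightness); // set the initial position explicitly
    while (cv::waitKey(30) != 27) { /* use `brightness` here */ }
    return 0;
}
@endcode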
-@note - -[__Qt Backend Only__] winname can be empty if the trackbar should be attached to the +@note [__Qt Backend Only__] winname can be empty if the trackbar should be attached to the control panel. Clicking the label of each trackbar enables editing the trackbar values manually. @param trackbarname Name of the created trackbar. -@param winname Name of the window that will be used as a parent of the created trackbar. -@param value Optional pointer to an integer variable whose value reflects the position of the -slider. Upon creation, the slider position is defined by this variable. -@param count Maximal position of the slider. The minimal position is always 0. -@param onChange Pointer to the function to be called every time the slider changes position. This -function should be prototyped as void Foo(int,void\*); , where the first parameter is the trackbar -position and the second parameter is the user data (see the next parameter). If the callback is -the NULL pointer, no callbacks are called, but only value is updated. -@param userdata User data that is passed as is to the callback. It can be used to handle trackbar -events without using global variables. +@param winname Name of the window that will contain the trackbar. +@param value Pointer to the integer value that will be changed by the trackbar. +Pass `nullptr` if the value pointer is not used. In this case, manually handle +the trackbar position in the callback function. +@param count Maximum position of the trackbar. +@param onChange Pointer to the function to be called every time the slider changes position. +This function should have the prototype void Foo(int, void\*);, where the first parameter is +the trackbar position, and the second parameter is the user data (see the next parameter). +If the callback is a nullptr, no callbacks are called, but the trackbar's value will still be +updated automatically. +@param userdata Optional user data that is passed to the callback. +@note If the `value` pointer is `nullptr`, the trackbar position must be manually managed. +Call the callback function manually with the desired initial value to avoid runtime warnings. +@see \ref tutorial_trackbar */ CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname, int* value, int count, @@ -585,9 +555,7 @@ CV_EXPORTS int createTrackbar(const String& trackbarname, const String& winname, The function returns the current position of the specified trackbar. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. @@ -599,9 +567,7 @@ CV_EXPORTS_W int getTrackbarPos(const String& trackbarname, const String& winnam The function sets the position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. @@ -614,9 +580,7 @@ CV_EXPORTS_W void setTrackbarPos(const String& trackbarname, const String& winna The function sets the maximum position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. 
@@ -629,9 +593,7 @@ CV_EXPORTS_W void setTrackbarMax(const String& trackbarname, const String& winna The function sets the minimum position of the specified trackbar in the specified window. -@note - -[__Qt Backend Only__] winname can be empty if the trackbar is attached to the control +@note [__Qt Backend Only__] winname can be empty if the trackbar is attached to the control panel. @param trackbarname Name of the trackbar. diff --git a/3rdParty/opencv2/imgcodecs.hpp b/3rdParty/opencv2/imgcodecs.hpp index 4a7b9521e5..4f12c18173 100644 --- a/3rdParty/opencv2/imgcodecs.hpp +++ b/3rdParty/opencv2/imgcodecs.hpp @@ -48,7 +48,6 @@ /** @defgroup imgcodecs Image file reading and writing @{ - @defgroup imgcodecs_c C API @defgroup imgcodecs_flags Flags used for image file reading and writing @defgroup imgcodecs_ios iOS glue @defgroup imgcodecs_macosx MacOS(OSX) glue @@ -69,7 +68,8 @@ namespace cv enum ImreadModes { IMREAD_UNCHANGED = -1, //!< If set, return the loaded image as is (with alpha channel, otherwise it gets cropped). Ignore EXIF orientation. IMREAD_GRAYSCALE = 0, //!< If set, always convert image to the single channel grayscale image (codec internal conversion). - IMREAD_COLOR = 1, //!< If set, always convert image to the 3 channel BGR color image. + IMREAD_COLOR_BGR = 1, //!< If set, always convert image to the 3 channel BGR color image. + IMREAD_COLOR = 1, //!< Same as IMREAD_COLOR_BGR. IMREAD_ANYDEPTH = 2, //!< If set, return 16-bit/32-bit image when the input has the corresponding depth, otherwise convert it to 8-bit. IMREAD_ANYCOLOR = 4, //!< If set, the image is read in any possible color format. IMREAD_LOAD_GDAL = 8, //!< If set, use the gdal driver for loading the image. @@ -79,7 +79,8 @@ enum ImreadModes { IMREAD_REDUCED_COLOR_4 = 33, //!< If set, always convert image to the 3 channel BGR color image and the image size reduced 1/4. IMREAD_REDUCED_GRAYSCALE_8 = 64, //!< If set, always convert image to the single channel grayscale image and the image size reduced 1/8. IMREAD_REDUCED_COLOR_8 = 65, //!< If set, always convert image to the 3 channel BGR color image and the image size reduced 1/8. - IMREAD_IGNORE_ORIENTATION = 128 //!< If set, do not rotate the image according to EXIF's orientation flag. + IMREAD_IGNORE_ORIENTATION = 128, //!< If set, do not rotate the image according to EXIF's orientation flag. + IMREAD_COLOR_RGB = 256, //!< If set, always convert image to the 3 channel RGB color image. }; //! Imwrite flags @@ -88,23 +89,92 @@ enum ImwriteFlags { IMWRITE_JPEG_PROGRESSIVE = 2, //!< Enable JPEG features, 0 or 1, default is False. IMWRITE_JPEG_OPTIMIZE = 3, //!< Enable JPEG features, 0 or 1, default is False. IMWRITE_JPEG_RST_INTERVAL = 4, //!< JPEG restart interval, 0 - 65535, default is 0 - no restart. - IMWRITE_JPEG_LUMA_QUALITY = 5, //!< Separate luma quality level, 0 - 100, default is -1 - don't use. - IMWRITE_JPEG_CHROMA_QUALITY = 6, //!< Separate chroma quality level, 0 - 100, default is -1 - don't use. + IMWRITE_JPEG_LUMA_QUALITY = 5, //!< Separate luma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported. + IMWRITE_JPEG_CHROMA_QUALITY = 6, //!< Separate chroma quality level, 0 - 100, default is -1 - don't use. If JPEG_LIB_VERSION < 70, Not supported. + IMWRITE_JPEG_SAMPLING_FACTOR = 7, //!< For JPEG, set sampling factor. See cv::ImwriteJPEGSamplingFactorParams. IMWRITE_PNG_COMPRESSION = 16, //!< For PNG, it can be the compression level from 0 to 9. A higher value means a smaller size and longer compression time. 
If specified, strategy is changed to IMWRITE_PNG_STRATEGY_DEFAULT (Z_DEFAULT_STRATEGY). Default value is 1 (best speed setting). IMWRITE_PNG_STRATEGY = 17, //!< One of cv::ImwritePNGFlags, default is IMWRITE_PNG_STRATEGY_RLE. IMWRITE_PNG_BILEVEL = 18, //!< Binary level PNG, 0 or 1, default is 0. IMWRITE_PXM_BINARY = 32, //!< For PPM, PGM, or PBM, it can be a binary format flag, 0 or 1. Default value is 1. - IMWRITE_EXR_TYPE = (3 << 4) + 0, /* 48 */ //!< override EXR storage type (FLOAT (FP32) is default) - IMWRITE_EXR_COMPRESSION = (3 << 4) + 1, /* 49 */ //!< override EXR compression type (ZIP_COMPRESSION = 3 is default) + IMWRITE_EXR_TYPE = (3 << 4) + 0 /* 48 */, //!< override EXR storage type (FLOAT (FP32) is default) + IMWRITE_EXR_COMPRESSION = (3 << 4) + 1 /* 49 */, //!< override EXR compression type (ZIP_COMPRESSION = 3 is default) + IMWRITE_EXR_DWA_COMPRESSION_LEVEL = (3 << 4) + 2 /* 50 */, //!< override EXR DWA compression level (45 is default) IMWRITE_WEBP_QUALITY = 64, //!< For WEBP, it can be a quality from 1 to 100 (the higher is the better). By default (without any parameter) and for quality above 100 the lossless compression is used. + IMWRITE_HDR_COMPRESSION = (5 << 4) + 0 /* 80 */, //!< specify HDR compression IMWRITE_PAM_TUPLETYPE = 128,//!< For PAM, sets the TUPLETYPE field to the corresponding string value that is defined for the format IMWRITE_TIFF_RESUNIT = 256,//!< For TIFF, use to specify which DPI resolution unit to set; see libtiff documentation for valid values IMWRITE_TIFF_XDPI = 257,//!< For TIFF, use to specify the X direction DPI IMWRITE_TIFF_YDPI = 258,//!< For TIFF, use to specify the Y direction DPI - IMWRITE_TIFF_COMPRESSION = 259,//!< For TIFF, use to specify the image compression scheme. See libtiff for integer constants corresponding to compression formats. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default. - IMWRITE_JPEG2000_COMPRESSION_X1000 = 272 //!< For JPEG2000, use to specify the target compression rate (multiplied by 1000). The value can be from 0 to 1000. Default is 1000. + IMWRITE_TIFF_COMPRESSION = 259,//!< For TIFF, use to specify the image compression scheme. See cv::ImwriteTiffCompressionFlags. Note, for images whose depth is CV_32F, only libtiff's SGILOG compression scheme is used. For other supported depths, the compression scheme can be specified by this flag; LZW compression is the default. + IMWRITE_TIFF_ROWSPERSTRIP = 278,//!< For TIFF, use to specify the number of rows per strip. + IMWRITE_TIFF_PREDICTOR = 317,//!< For TIFF, use to specify predictor. See cv::ImwriteTiffPredictorFlags. + IMWRITE_JPEG2000_COMPRESSION_X1000 = 272,//!< For JPEG2000, use to specify the target compression rate (multiplied by 1000). The value can be from 0 to 1000. Default is 1000. + IMWRITE_AVIF_QUALITY = 512,//!< For AVIF, it can be a quality between 0 and 100 (the higher the better). Default is 95. + IMWRITE_AVIF_DEPTH = 513,//!< For AVIF, it can be 8, 10 or 12. If >8, it is stored/read as CV_32F. Default is 8. + IMWRITE_AVIF_SPEED = 514,//!< For AVIF, it is between 0 (slowest) and (fastest). Default is 9. + IMWRITE_JPEGXL_QUALITY = 640,//!< For JPEG XL, it can be a quality from 0 to 100 (the higher is the better). Default value is 95. If set, distance parameter is re-calicurated from quality level automatically. This parameter request libjxl v0.10 or later. 
+ IMWRITE_JPEGXL_EFFORT = 641,//!< For JPEG XL, encoder effort/speed level without affecting decoding speed; it is between 1 (fastest) and 10 (slowest). Default is 7. + IMWRITE_JPEGXL_DISTANCE = 642,//!< For JPEG XL, distance level for lossy compression: target max butteraugli distance, lower = higher quality, 0 = lossless; range: 0 .. 25. Default is 1. + IMWRITE_JPEGXL_DECODING_SPEED = 643,//!< For JPEG XL, decoding speed tier for the provided options; minimum is 0 (slowest to decode, best quality/density), and maximum is 4 (fastest to decode, at the cost of some quality/density). Default is 0. + IMWRITE_GIF_LOOP = 1024,//!< For GIF, it can be a loop flag from 0 to 65535. Default is 0 - loop forever. + IMWRITE_GIF_SPEED = 1025,//!< For GIF, it is between 1 (slowest) and 100 (fastest). Default is 96. + IMWRITE_GIF_QUALITY = 1026, //!< For GIF, it can be a quality from 1 to 8. Default is 2. See cv::ImwriteGifCompressionFlags. + IMWRITE_GIF_DITHER = 1027, //!< For GIF, it can be a quality from -1(most dither) to 3(no dither). Default is 0. + IMWRITE_GIF_TRANSPARENCY = 1028, //!< For GIF, the alpha channel lower than this will be set to transparent. Default is 1. + IMWRITE_GIF_COLORTABLE = 1029 //!< For GIF, 0 means global color table is used, 1 means local color table is used. Default is 0. +}; + +enum ImwriteJPEGSamplingFactorParams { + IMWRITE_JPEG_SAMPLING_FACTOR_411 = 0x411111, //!< 4x1,1x1,1x1 + IMWRITE_JPEG_SAMPLING_FACTOR_420 = 0x221111, //!< 2x2,1x1,1x1(Default) + IMWRITE_JPEG_SAMPLING_FACTOR_422 = 0x211111, //!< 2x1,1x1,1x1 + IMWRITE_JPEG_SAMPLING_FACTOR_440 = 0x121111, //!< 1x2,1x1,1x1 + IMWRITE_JPEG_SAMPLING_FACTOR_444 = 0x111111 //!< 1x1,1x1,1x1(No subsampling) }; +enum ImwriteTiffCompressionFlags { + IMWRITE_TIFF_COMPRESSION_NONE = 1, //!< dump mode + IMWRITE_TIFF_COMPRESSION_CCITTRLE = 2, //!< CCITT modified Huffman RLE + IMWRITE_TIFF_COMPRESSION_CCITTFAX3 = 3, //!< CCITT Group 3 fax encoding + IMWRITE_TIFF_COMPRESSION_CCITT_T4 = 3, //!< CCITT T.4 (TIFF 6 name) + IMWRITE_TIFF_COMPRESSION_CCITTFAX4 = 4, //!< CCITT Group 4 fax encoding + IMWRITE_TIFF_COMPRESSION_CCITT_T6 = 4, //!< CCITT T.6 (TIFF 6 name) + IMWRITE_TIFF_COMPRESSION_LZW = 5, //!< Lempel-Ziv & Welch + IMWRITE_TIFF_COMPRESSION_OJPEG = 6, //!< !6.0 JPEG + IMWRITE_TIFF_COMPRESSION_JPEG = 7, //!< %JPEG DCT compression + IMWRITE_TIFF_COMPRESSION_T85 = 9, //!< !TIFF/FX T.85 JBIG compression + IMWRITE_TIFF_COMPRESSION_T43 = 10, //!< !TIFF/FX T.43 colour by layered JBIG compression + IMWRITE_TIFF_COMPRESSION_NEXT = 32766, //!< NeXT 2-bit RLE + IMWRITE_TIFF_COMPRESSION_CCITTRLEW = 32771, //!< #1 w/ word alignment + IMWRITE_TIFF_COMPRESSION_PACKBITS = 32773, //!< Macintosh RLE + IMWRITE_TIFF_COMPRESSION_THUNDERSCAN = 32809, //!< ThunderScan RLE + IMWRITE_TIFF_COMPRESSION_IT8CTPAD = 32895, //!< IT8 CT w/padding + IMWRITE_TIFF_COMPRESSION_IT8LW = 32896, //!< IT8 Linework RLE + IMWRITE_TIFF_COMPRESSION_IT8MP = 32897, //!< IT8 Monochrome picture + IMWRITE_TIFF_COMPRESSION_IT8BL = 32898, //!< IT8 Binary line art + IMWRITE_TIFF_COMPRESSION_PIXARFILM = 32908, //!< Pixar companded 10bit LZW + IMWRITE_TIFF_COMPRESSION_PIXARLOG = 32909, //!< Pixar companded 11bit ZIP + IMWRITE_TIFF_COMPRESSION_DEFLATE = 32946, //!< Deflate compression, legacy tag + IMWRITE_TIFF_COMPRESSION_ADOBE_DEFLATE = 8, //!< Deflate compression, as recognized by Adobe + IMWRITE_TIFF_COMPRESSION_DCS = 32947, //!< Kodak DCS encoding + IMWRITE_TIFF_COMPRESSION_JBIG = 34661, //!< ISO JBIG + IMWRITE_TIFF_COMPRESSION_SGILOG = 34676, //!< SGI Log Luminance RLE + 
IMWRITE_TIFF_COMPRESSION_SGILOG24 = 34677, //!< SGI Log 24-bit packed + IMWRITE_TIFF_COMPRESSION_JP2000 = 34712, //!< Leadtools JPEG2000 + IMWRITE_TIFF_COMPRESSION_LERC = 34887, //!< ESRI Lerc codec: https://github.com/Esri/lerc + IMWRITE_TIFF_COMPRESSION_LZMA = 34925, //!< LZMA2 + IMWRITE_TIFF_COMPRESSION_ZSTD = 50000, //!< ZSTD: WARNING not registered in Adobe-maintained registry + IMWRITE_TIFF_COMPRESSION_WEBP = 50001, //!< WEBP: WARNING not registered in Adobe-maintained registry + IMWRITE_TIFF_COMPRESSION_JXL = 50002 //!< JPEGXL: WARNING not registered in Adobe-maintained registry +}; + +enum ImwriteTiffPredictorFlags { + IMWRITE_TIFF_PREDICTOR_NONE = 1, //!< no prediction scheme used + IMWRITE_TIFF_PREDICTOR_HORIZONTAL = 2, //!< horizontal differencing + IMWRITE_TIFF_PREDICTOR_FLOATINGPOINT = 3 //!< floating point predictor + +}; + enum ImwriteEXRTypeFlags { /*IMWRITE_EXR_TYPE_UNIT = 0, //!< not supported */ IMWRITE_EXR_TYPE_HALF = 1, //!< store as HALF (FP16) @@ -150,24 +220,74 @@ enum ImwritePAMFlags { IMWRITE_PAM_FORMAT_RGB_ALPHA = 5 }; +//! Imwrite HDR specific values for IMWRITE_HDR_COMPRESSION parameter key +enum ImwriteHDRCompressionFlags { + IMWRITE_HDR_COMPRESSION_NONE = 0, + IMWRITE_HDR_COMPRESSION_RLE = 1 +}; + +//! Imwrite GIF specific values for IMWRITE_GIF_QUALITY parameter key, if larger than 3, then its related to the size of the color table. +enum ImwriteGIFCompressionFlags { + IMWRITE_GIF_FAST_NO_DITHER = 1, + IMWRITE_GIF_FAST_FLOYD_DITHER = 2, + IMWRITE_GIF_COLORTABLE_SIZE_8 = 3, + IMWRITE_GIF_COLORTABLE_SIZE_16 = 4, + IMWRITE_GIF_COLORTABLE_SIZE_32 = 5, + IMWRITE_GIF_COLORTABLE_SIZE_64 = 6, + IMWRITE_GIF_COLORTABLE_SIZE_128 = 7, + IMWRITE_GIF_COLORTABLE_SIZE_256 = 8 +}; + //! @} imgcodecs_flags +/** @brief Represents an animation with multiple frames. +The `Animation` struct is designed to store and manage data for animated sequences such as those from animated formats (e.g., GIF, AVIF, APNG, WebP). +It provides support for looping, background color settings, frame timing, and frame storage. +*/ +struct CV_EXPORTS_W_SIMPLE Animation +{ + //! Number of times the animation should loop. 0 means infinite looping. + CV_PROP_RW int loop_count; + //! Background color of the animation in BGRA format. + CV_PROP_RW Scalar bgcolor; + //! Duration for each frame in milliseconds. + CV_PROP_RW std::vector durations; + //! Vector of frames, where each Mat represents a single frame. + CV_PROP_RW std::vector frames; + + /** @brief Constructs an Animation object with optional loop count and background color. + + @param loopCount An integer representing the number of times the animation should loop: + - `0` (default) indicates infinite looping, meaning the animation will replay continuously. + - Positive values denote finite repeat counts, allowing the animation to play a limited number of times. + - If a negative value or a value beyond the maximum of `0xffff` (65535) is provided, it is reset to `0` + (infinite looping) to maintain valid bounds. + + @param bgColor A `Scalar` object representing the background color in BGRA format: + - Defaults to `Scalar()`, indicating an empty color (usually transparent if supported). + - This background color provides a solid fill behind frames that have transparency, ensuring a consistent display appearance. + */ + Animation(int loopCount = 0, Scalar bgColor = Scalar()); +}; + /** @brief Loads an image from a file. @anchor imread -The function imread loads an image from the specified file and returns it. 
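To illustrate how the write flags above are passed to an encoder, a minimal sketch combining JPEG quality with a sampling factor from cv::ImwriteJPEGSamplingFactorParams; the output path and pixel values are placeholders:

@code
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img(480, 640, CV_8UC3, cv::Scalar(32, 64, 96));

    // Encoder parameters are passed as (flag, value) pairs, per cv::ImwriteFlags above.
    std::vector<int> params = {
        cv::IMWRITE_JPEG_QUALITY, 90,
        cv::IMWRITE_JPEG_SAMPLING_FACTOR, cv::IMWRITE_JPEG_SAMPLING_FACTOR_444
    };
    return cv::imwrite("out.jpg", img, params) ? 0 : 1;   // "out.jpg" is a placeholder path
}
@endcode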
If the image cannot be -read (because of missing file, improper permissions, unsupported or invalid format), the function -returns an empty matrix ( Mat::data==NULL ). +The `imread` function loads an image from the specified file and returns OpenCV matrix. If the image cannot be +read (because of a missing file, improper permissions, or unsupported/invalid format), the function +returns an empty matrix. Currently, the following file formats are supported: - Windows bitmaps - \*.bmp, \*.dib (always supported) +- GIF files - \*.gif (always supported) - JPEG files - \*.jpeg, \*.jpg, \*.jpe (see the *Note* section) - JPEG 2000 files - \*.jp2 (see the *Note* section) - Portable Network Graphics - \*.png (see the *Note* section) - WebP - \*.webp (see the *Note* section) -- Portable image format - \*.pbm, \*.pgm, \*.ppm \*.pxm, \*.pnm (always supported) +- AVIF - \*.avif (see the *Note* section) +- Portable image format - \*.pbm, \*.pgm, \*.ppm, \*.pxm, \*.pnm (always supported) - PFM files - \*.pfm (see the *Note* section) - Sun rasters - \*.sr, \*.ras (always supported) - TIFF files - \*.tiff, \*.tif (see the *Note* section) @@ -176,34 +296,44 @@ Currently, the following file formats are supported: - Raster and Vector geospatial data supported by GDAL (see the *Note* section) @note -- The function determines the type of an image by the content, not by the file extension. +- The function determines the type of an image by its content, not by the file extension. - In the case of color images, the decoded images will have the channels stored in **B G R** order. - When using IMREAD_GRAYSCALE, the codec's internal grayscale conversion will be used, if available. - Results may differ to the output of cvtColor() -- On Microsoft Windows\* OS and MacOSX\*, the codecs shipped with an OpenCV image (libjpeg, - libpng, libtiff, and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, - and TIFFs. On MacOSX, there is also an option to use native MacOSX image readers. But beware - that currently these native image loaders give images with different pixel values because of - the color management embedded into MacOSX. -- On Linux\*, BSD flavors and other Unix-like open-source operating systems, OpenCV looks for - codecs supplied with an OS image. Install the relevant packages (do not forget the development - files, for example, "libjpeg-dev", in Debian\* and Ubuntu\*) to get the codec support or turn + Results may differ from the output of cvtColor(). +- On Microsoft Windows\* and Mac OS\*, the codecs shipped with OpenCV (libjpeg, libpng, libtiff, + and libjasper) are used by default. So, OpenCV can always read JPEGs, PNGs, and TIFFs. On Mac OS, + there is also an option to use native Mac OS image readers. However, beware that currently these + native image loaders give images with different pixel values because of the color management embedded + into Mac OS. +- On Linux\*, BSD flavors, and other Unix-like open-source operating systems, OpenCV looks for + codecs supplied with the OS. Ensure the relevant packages are installed (including development + files, such as "libjpeg-dev" in Debian\* and Ubuntu\*) to get codec support, or turn on the OPENCV_BUILD_3RDPARTY_LIBS flag in CMake. 
-- In the case you set *WITH_GDAL* flag to true in CMake and @ref IMREAD_LOAD_GDAL to load the image, - then the [GDAL](http://www.gdal.org) driver will be used in order to decode the image, supporting - the following formats: [Raster](http://www.gdal.org/formats_list.html), - [Vector](http://www.gdal.org/ogr_formats.html). -- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account - and thus the image will be rotated accordingly except if the flags @ref IMREAD_IGNORE_ORIENTATION +- If the *WITH_GDAL* flag is set to true in CMake and @ref IMREAD_LOAD_GDAL is used to load the image, + the [GDAL](http://www.gdal.org) driver will be used to decode the image, supporting + [Raster](http://www.gdal.org/formats_list.html) and [Vector](http://www.gdal.org/ogr_formats.html) formats. +- If EXIF information is embedded in the image file, the EXIF orientation will be taken into account, + and thus the image will be rotated accordingly unless the flags @ref IMREAD_IGNORE_ORIENTATION or @ref IMREAD_UNCHANGED are passed. -- Use the IMREAD_UNCHANGED flag to keep the floating point values from PFM image. -- By default number of pixels must be less than 2^30. Limit can be set using system - variable OPENCV_IO_MAX_IMAGE_PIXELS +- Use the IMREAD_UNCHANGED flag to preserve the floating-point values from PFM images. +- By default, the number of pixels must be less than 2^30. This limit can be changed by setting + the environment variable `OPENCV_IO_MAX_IMAGE_PIXELS`. See @ref tutorial_env_reference. + +@param filename Name of the file to be loaded. +@param flags Flag that can take values of `cv::ImreadModes`. +*/ +CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR_BGR ); +/** @brief Loads an image from a file. + +This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts and the return value. @param filename Name of file to be loaded. +@param dst object in which the image will be loaded. @param flags Flag that can take values of cv::ImreadModes -*/ -CV_EXPORTS_W Mat imread( const String& filename, int flags = IMREAD_COLOR ); +@note +The image passing through the img parameter can be pre-allocated. The memory is reused if the shape and the type match with the load image. + */ +CV_EXPORTS_W void imread( const String& filename, OutputArray dst, int flags = IMREAD_COLOR_BGR ); /** @brief Loads a multi-page image from a file. @@ -215,7 +345,7 @@ The function imreadmulti loads a multi-page image from the specified file into a */ CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector& mats, int flags = IMREAD_ANYCOLOR); -/** @brief Loads a of images of a multi-page image from a file. +/** @brief Loads images of a multi-page image from a file. The function imreadmulti loads a specified range from a multi-page image from the specified file into a vector of Mat objects. @param filename Name of file to be loaded. @@ -227,29 +357,77 @@ The function imreadmulti loads a specified range from a multi-page image from th */ CV_EXPORTS_W bool imreadmulti(const String& filename, CV_OUT std::vector& mats, int start, int count, int flags = IMREAD_ANYCOLOR); -/** @brief Returns the number of images inside the give file +/** @example samples/cpp/tutorial_code/imgcodecs/animations.cpp +An example to show usage of cv::imreadanimation and cv::imwriteanimation functions. 
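Since the referenced sample file is not vendored here, a minimal sketch of the same idea, assuming an animated input exists at a placeholder path:

@code
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>

int main()
{
    cv::Animation anim;
    if (!cv::imreadanimation("anim.gif", anim))     // "anim.gif" is a placeholder path
        return 1;

    for (size_t i = 0; i < anim.frames.size(); ++i)
    {
        cv::imshow("frame", anim.frames[i]);
        cv::waitKey(anim.durations[i]);             // per-frame delay in milliseconds
    }

    // Re-encode the same frames; the container format follows the file extension.
    return cv::imwriteanimation("anim.webp", anim) ? 0 : 1;
}
@endcode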
+Check @ref tutorial_animations "the corresponding tutorial" for more details +*/ + +/** @brief Loads frames from an animated image file into an Animation structure. + +The function imreadanimation loads frames from an animated image file (e.g., GIF, AVIF, APNG, WEBP) into the provided Animation struct. + +@param filename A string containing the path to the file. +@param animation A reference to an Animation structure where the loaded frames will be stored. It should be initialized before the function is called. +@param start The index of the first frame to load. This is optional and defaults to 0. +@param count The number of frames to load. This is optional and defaults to 32767. + +@return Returns true if the file was successfully loaded and frames were extracted; returns false otherwise. +*/ +CV_EXPORTS_W bool imreadanimation(const String& filename, CV_OUT Animation& animation, int start = 0, int count = INT16_MAX); + +/** @brief Saves an Animation to a specified file. + +The function imwriteanimation saves the provided Animation data to the specified file in an animated format. +Supported formats depend on the implementation and may include formats like GIF, AVIF, APNG, or WEBP. -The function imcount will return the number of pages in a multi-page image, or 1 for single-page images +@param filename The name of the file where the animation will be saved. The file extension determines the format. +@param animation A constant reference to an Animation struct containing the frames and metadata to be saved. +@param params Optional format-specific parameters encoded as pairs (paramId_1, paramValue_1, paramId_2, paramValue_2, ...). +These parameters are used to specify additional options for the encoding process. Refer to `cv::ImwriteFlags` for details on possible parameters. + +@return Returns true if the animation was successfully saved; returns false otherwise. +*/ +CV_EXPORTS_W bool imwriteanimation(const String& filename, const Animation& animation, const std::vector& params = std::vector()); + +/** @brief Returns the number of images inside the given file + +The function imcount returns the number of pages in a multi-page image (e.g. TIFF), the number of frames in an animation (e.g. AVIF), and 1 otherwise. +If the image cannot be decoded, 0 is returned. @param filename Name of file to be loaded. @param flags Flag that can take values of cv::ImreadModes, default with cv::IMREAD_ANYCOLOR. +@todo when cv::IMREAD_LOAD_GDAL flag used the return value will be 0 or 1 because OpenCV's GDAL decoder doesn't support multi-page reading yet. */ CV_EXPORTS_W size_t imcount(const String& filename, int flags = IMREAD_ANYCOLOR); /** @brief Saves an image to a specified file. The function imwrite saves the image to the specified file. The image format is chosen based on the -filename extension (see cv::imread for the list of extensions). In general, only 8-bit +filename extension (see cv::imread for the list of extensions). In general, only 8-bit unsigned (CV_8U) single-channel or 3-channel (with 'BGR' channel order) images can be saved using this function, with these exceptions: -- 16-bit unsigned (CV_16U) images can be saved in the case of PNG, JPEG 2000, and TIFF formats -- 32-bit float (CV_32F) images can be saved in PFM, TIFF, OpenEXR, and Radiance HDR formats; - 3-channel (CV_32FC3) TIFF images will be saved using the LogLuv high dynamic range encoding - (4 bytes per pixel) -- PNG images with an alpha channel can be saved using this function. 
To do this, create -8-bit (or 16-bit) 4-channel image BGRA, where the alpha channel goes last. Fully transparent pixels -should have alpha set to 0, fully opaque pixels should have alpha set to 255/65535 (see the code sample below). -- Multiple images (vector of Mat) can be saved in TIFF format (see the code sample below). +- With OpenEXR encoder, only 32-bit float (CV_32F) images can be saved. + - 8-bit unsigned (CV_8U) images are not supported. +- With Radiance HDR encoder, non 64-bit float (CV_64F) images can be saved. + - All images will be converted to 32-bit float (CV_32F). +- With JPEG 2000 encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved. +- With JPEG XL encoder, 8-bit unsigned (CV_8U), 16-bit unsigned (CV_16U) and 32-bit float(CV_32F) images can be saved. + - JPEG XL images with an alpha channel can be saved using this function. + To do this, create 8-bit (or 16-bit, 32-bit float) 4-channel image BGRA, where the alpha channel goes last. + Fully transparent pixels should have alpha set to 0, fully opaque pixels should have alpha set to 255/65535/1.0. +- With PAM encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved. +- With PNG encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved. + - PNG images with an alpha channel can be saved using this function. To do this, create + 8-bit (or 16-bit) 4-channel image BGRA, where the alpha channel goes last. Fully transparent pixels + should have alpha set to 0, fully opaque pixels should have alpha set to 255/65535 (see the code sample below). +- With PGM/PPM encoder, 8-bit unsigned (CV_8U) and 16-bit unsigned (CV_16U) images can be saved. +- With TIFF encoder, 8-bit unsigned (CV_8U), 8-bit signed (CV_8S), + 16-bit unsigned (CV_16U), 16-bit signed (CV_16S), + 32-bit signed (CV_32S), + 32-bit float (CV_32F) and 64-bit float (CV_64F) images can be saved. + - Multiple images (vector of Mat) can be saved in TIFF format (see the code sample below). + - 32-bit float 3-channel (CV_32FC3) TIFF images will be saved + using the LogLuv high dynamic range encoding (4 bytes per pixel) If the image format is not supported, the image will be converted to 8-bit unsigned (CV_8U) and saved that way. @@ -267,7 +445,7 @@ It also demonstrates how to save multiple images in a TIFF file: CV_EXPORTS_W bool imwrite( const String& filename, InputArray img, const std::vector& params = std::vector()); -/// @overload multi-image overload for bindings +//! @brief multi-image overload for bindings CV_WRAP static inline bool imwritemulti(const String& filename, InputArrayOfArrays img, const std::vector& params = std::vector()) @@ -289,20 +467,36 @@ See cv::imread for the list of supported formats and flags description. CV_EXPORTS_W Mat imdecode( InputArray buf, int flags ); /** @overload -@param buf -@param flags +@param buf Input array or vector of bytes. +@param flags The same flags as in cv::imread, see cv::ImreadModes. @param dst The optional output placeholder for the decoded matrix. It can save the image -reallocations when the function is called repeatedly for images of the same size. +reallocations when the function is called repeatedly for images of the same size. In case of decoder +failure the function returns empty cv::Mat object, but does not release user-provided dst buffer. */ CV_EXPORTS Mat imdecode( InputArray buf, int flags, Mat* dst); +/** @brief Reads a multi-page image from a buffer in memory. 
+ +The function imdecodemulti reads a multi-page image from the specified buffer in the memory. If the buffer is too short or +contains invalid data, the function returns false. + +See cv::imreadmulti for the list of supported formats and flags description. + +@note In the case of color images, the decoded images will have the channels stored in **B G R** order. +@param buf Input array or vector of bytes. +@param flags The same flags as in cv::imread, see cv::ImreadModes. +@param mats A vector of Mat objects holding each page, if more than one. +@param range A continuous selection of pages. +*/ +CV_EXPORTS_W bool imdecodemulti(InputArray buf, int flags, CV_OUT std::vector& mats, const cv::Range& range = Range::all()); + /** @brief Encodes an image into a memory buffer. The function imencode compresses the image and stores it in the memory buffer that is resized to fit the result. See cv::imwrite for the list of supported formats and flags description. @param ext File extension that defines the output format. Must include a leading period. -@param img Image to be written. +@param img Image to be compressed. @param buf Output buffer resized to fit the compressed image. @param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags. */ @@ -310,18 +504,97 @@ CV_EXPORTS_W bool imencode( const String& ext, InputArray img, CV_OUT std::vector& buf, const std::vector& params = std::vector()); -/** @brief Returns true if the specified image can be decoded by OpenCV +/** @brief Encodes array of images into a memory buffer. + +The function is analog to cv::imencode for in-memory multi-page image compression. +See cv::imwrite for the list of supported formats and flags description. -@param filename File name of the image +@param ext File extension that defines the output format. Must include a leading period. +@param imgs Vector of images to be written. +@param buf Output buffer resized to fit the compressed data. +@param params Format-specific parameters. See cv::imwrite and cv::ImwriteFlags. +*/ +CV_EXPORTS_W bool imencodemulti( const String& ext, InputArrayOfArrays imgs, + CV_OUT std::vector& buf, + const std::vector& params = std::vector()); + +/** @brief Checks if the specified image file can be decoded by OpenCV. + +The function haveImageReader checks if OpenCV is capable of reading the specified file. +This can be useful for verifying support for a given image format before attempting to load an image. + +@param filename The name of the file to be checked. +@return true if an image reader for the specified file is available and the file can be opened, false otherwise. + +@note The function checks the availability of image codecs that are either built into OpenCV or dynamically loaded. +It does not check for the actual existence of the file but rather the ability to read the specified file type. +If the file cannot be opened or the format is unsupported, the function will return false. + +@sa cv::haveImageWriter, cv::imread, cv::imdecode */ CV_EXPORTS_W bool haveImageReader( const String& filename ); -/** @brief Returns true if an image with the specified filename can be encoded by OpenCV +/** @brief Checks if the specified image file or specified file extension can be encoded by OpenCV. - @param filename File name of the image - */ +The function haveImageWriter checks if OpenCV is capable of writing images with the specified file extension. +This can be useful for verifying support for a given image format before attempting to save an image. 
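To tie the in-memory codec functions above together, a minimal sketch of an encode/decode round trip with an availability check; the extension and matrix contents are illustrative:

@code
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    cv::Mat img(64, 64, CV_8UC3, cv::Scalar(0, 128, 255));

    if (!cv::haveImageWriter(".png"))          // verify codec support before encoding
        return 1;

    std::vector<uchar> buf;
    cv::imencode(".png", img, buf);            // compress into the in-memory buffer

    cv::Mat decoded = cv::imdecode(buf, cv::IMREAD_COLOR);
    return decoded.empty() ? 1 : 0;
}
@endcode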
+ +@param filename The name of the file or the file extension (e.g., ".jpg", ".png"). +It is recommended to provide the file extension rather than the full file name. +@return true if an image writer for the specified extension is available, false otherwise. + +@note The function checks the availability of image codecs that are either built into OpenCV or dynamically loaded. +It does not check for the actual existence of the file but rather the ability to write files of the given type. + +@sa cv::haveImageReader, cv::imwrite, cv::imencode +*/ CV_EXPORTS_W bool haveImageWriter( const String& filename ); +/** @brief To read multi-page images on demand + +The ImageCollection class provides iterator API to read multi-page images on demand. Create iterator +to the collection of the images and iterate over the collection. Decode the necessary page with operator*. + +The performance of page decoding is O(1) if collection is increment sequentially. If the user wants to access random page, +then the time Complexity is O(n) because the collection has to be reinitialized every time in order to go to the correct page. +However, the intermediate pages are not decoded during the process, so typically it's quite fast. +This is required because multi-page codecs does not support going backwards. +After decoding the one page, it is stored inside the collection cache. Hence, trying to get Mat object from already decoded page is O(1). +If you need memory, you can use .releaseCache() method to release cached index. +The space complexity is O(n) if all pages are decoded into memory. The user is able to decode and release images on demand. +*/ +class CV_EXPORTS ImageCollection { +public: + struct CV_EXPORTS iterator { + iterator(ImageCollection* col); + iterator(ImageCollection* col, int end); + Mat& operator*(); + Mat* operator->(); + iterator& operator++(); + iterator operator++(int); + friend bool operator== (const iterator& a, const iterator& b) { return a.m_curr == b.m_curr; } + friend bool operator!= (const iterator& a, const iterator& b) { return a.m_curr != b.m_curr; } + + private: + ImageCollection* m_pCollection; + int m_curr; + }; + + ImageCollection(); + ImageCollection(const String& filename, int flags); + void init(const String& img, int flags); + size_t size() const; + const Mat& at(int index); + const Mat& operator[](int index); + void releaseCache(int index); + iterator begin(); + iterator end(); + + class Impl; + Ptr getImpl(); +protected: + Ptr pImpl; +}; //! @} imgcodecs diff --git a/3rdParty/opencv2/imgproc.hpp b/3rdParty/opencv2/imgproc.hpp index ead1274704..4f8fe77d26 100644 --- a/3rdParty/opencv2/imgproc.hpp +++ b/3rdParty/opencv2/imgproc.hpp @@ -46,139 +46,143 @@ #include "opencv2/core.hpp" /** - @defgroup imgproc Image Processing +@defgroup imgproc Image Processing -This module includes image-processing functions. +This module offers a comprehensive suite of image processing functions, enabling tasks such as those listed above. - @{ +@{ @defgroup imgproc_filter Image Filtering -Functions and classes described in this section are used to perform various linear or non-linear -filtering operations on 2D images (represented as Mat's). It means that for each pixel location -\f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to -compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of -morphological operations, it is the minimum or maximum values, and so on. 
The computed response is -stored in the destination image at the same location \f$(x,y)\f$. It means that the output image -will be of the same size as the input image. Normally, the functions support multi-channel arrays, -in which case every channel is processed independently. Therefore, the output image will also have -the same number of channels as the input one. - -Another common feature of the functions and classes described in this section is that, unlike -simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For -example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when -processing the left-most pixels in each row, you need pixels to the left of them, that is, outside -of the image. You can let these pixels be the same as the left-most image pixels ("replicated -border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant -border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method. -For details, see #BorderTypes - -@anchor filter_depths -### Depth combinations -Input depth (src.depth()) | Output depth (ddepth) ---------------------------|---------------------- -CV_8U | -1/CV_16S/CV_32F/CV_64F -CV_16U/CV_16S | -1/CV_32F/CV_64F -CV_32F | -1/CV_32F/CV_64F -CV_64F | -1/CV_64F - -@note when ddepth=-1, the output image will have the same depth as the source. + Functions and classes described in this section are used to perform various linear or non-linear + filtering operations on 2D images (represented as Mat's). It means that for each pixel location + \f$(x,y)\f$ in the source image (normally, rectangular), its neighborhood is considered and used to + compute the response. In case of a linear filter, it is a weighted sum of pixel values. In case of + morphological operations, it is the minimum or maximum values, and so on. The computed response is + stored in the destination image at the same location \f$(x,y)\f$. It means that the output image + will be of the same size as the input image. Normally, the functions support multi-channel arrays, + in which case every channel is processed independently. Therefore, the output image will also have + the same number of channels as the input one. + + Another common feature of the functions and classes described in this section is that, unlike + simple arithmetic functions, they need to extrapolate values of some non-existing pixels. For + example, if you want to smooth an image using a Gaussian \f$3 \times 3\f$ filter, then, when + processing the left-most pixels in each row, you need pixels to the left of them, that is, outside + of the image. You can let these pixels be the same as the left-most image pixels ("replicated + border" extrapolation method), or assume that all the non-existing pixels are zeros ("constant + border" extrapolation method), and so on. OpenCV enables you to specify the extrapolation method. + For details, see #BorderTypes + + @anchor filter_depths + ### Depth combinations + Input depth (src.depth()) | Output depth (ddepth) + --------------------------|---------------------- + CV_8U | -1/CV_16S/CV_32F/CV_64F + CV_16U/CV_16S | -1/CV_32F/CV_64F + CV_32F | -1/CV_32F + CV_64F | -1/CV_64F + + @note when ddepth=-1, the output image will have the same depth as the source. 
+ + @note if you need double floating-point accuracy and using single floating-point input data + (CV_32F input and CV_64F output depth combination), you can use @ref Mat.convertTo to convert + the input data to the desired precision. @defgroup imgproc_transform Geometric Image Transformations -The functions in this section perform various geometrical transformations of 2D images. They do not -change the image content but deform the pixel grid and map this deformed grid to the destination -image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from -destination to the source. That is, for each pixel \f$(x, y)\f$ of the destination image, the -functions compute coordinates of the corresponding "donor" pixel in the source image and copy the -pixel value: - -\f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f] - -In case when you specify the forward mapping \f$\left: \texttt{src} \rightarrow -\texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping -\f$\left: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula. - -The actual implementations of the geometrical transformations, from the most generic remap and to -the simplest and the fastest resize, need to solve two main problems with the above formula: - -- Extrapolation of non-existing pixels. Similarly to the filtering functions described in the -previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both -of them may fall outside of the image. In this case, an extrapolation method needs to be used. -OpenCV provides the same selection of extrapolation methods as in the filtering functions. In -addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in -the destination image will not be modified at all. - -- Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point -numbers. This means that \f$\left\f$ can be either an affine or perspective -transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional -coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the -nearest integer coordinates and the corresponding pixel can be used. This is called a -nearest-neighbor interpolation. However, a better result can be achieved by using more -sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) , -where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y), -f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the -interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See -#resize for details. - -@note The geometrical transformations do not work with `CV_8S` or `CV_32S` images. + The functions in this section perform various geometrical transformations of 2D images. They do not + change the image content but deform the pixel grid and map this deformed grid to the destination + image. In fact, to avoid sampling artifacts, the mapping is done in the reverse order, from + destination to the source. 
That is, for each pixel \f$(x, y)\f$ of the destination image, the + functions compute coordinates of the corresponding "donor" pixel in the source image and copy the + pixel value: + + \f[\texttt{dst} (x,y)= \texttt{src} (f_x(x,y), f_y(x,y))\f] + + In case when you specify the forward mapping \f$\left: \texttt{src} \rightarrow + \texttt{dst}\f$, the OpenCV functions first compute the corresponding inverse mapping + \f$\left: \texttt{dst} \rightarrow \texttt{src}\f$ and then use the above formula. + + The actual implementations of the geometrical transformations, from the most generic remap and to + the simplest and the fastest resize, need to solve two main problems with the above formula: + + - Extrapolation of non-existing pixels. Similarly to the filtering functions described in the + previous section, for some \f$(x,y)\f$, either one of \f$f_x(x,y)\f$, or \f$f_y(x,y)\f$, or both + of them may fall outside of the image. In this case, an extrapolation method needs to be used. + OpenCV provides the same selection of extrapolation methods as in the filtering functions. In + addition, it provides the method #BORDER_TRANSPARENT. This means that the corresponding pixels in + the destination image will not be modified at all. + + - Interpolation of pixel values. Usually \f$f_x(x,y)\f$ and \f$f_y(x,y)\f$ are floating-point + numbers. This means that \f$\left\f$ can be either an affine or perspective + transformation, or radial lens distortion correction, and so on. So, a pixel value at fractional + coordinates needs to be retrieved. In the simplest case, the coordinates can be just rounded to the + nearest integer coordinates and the corresponding pixel can be used. This is called a + nearest-neighbor interpolation. However, a better result can be achieved by using more + sophisticated [interpolation methods](http://en.wikipedia.org/wiki/Multivariate_interpolation) , + where a polynomial function is fit into some neighborhood of the computed pixel \f$(f_x(x,y), + f_y(x,y))\f$, and then the value of the polynomial at \f$(f_x(x,y), f_y(x,y))\f$ is taken as the + interpolated pixel value. In OpenCV, you can choose between several interpolation methods. See + #resize for details. + + @note The geometrical transformations do not work with `CV_8S` or `CV_32S` images. @defgroup imgproc_misc Miscellaneous Image Transformations @defgroup imgproc_draw Drawing Functions -Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be -rendered with antialiasing (implemented only for 8-bit images for now). All the functions include -the parameter color that uses an RGB value (that may be constructed with the Scalar constructor ) -for color images and brightness for grayscale images. For color images, the channel ordering is -normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. So, if you form a -color using the Scalar constructor, it should look like: + Drawing functions work with matrices/images of arbitrary depth. The boundaries of the shapes can be + rendered with antialiasing (implemented only for 8-bit images for now). All the functions include + the parameter color that uses an RGB value (that may be constructed with the Scalar constructor ) + for color images and brightness for grayscale images. For color images, the channel ordering is + normally *Blue, Green, Red*. This is what imshow, imread, and imwrite expect. 
So, if you form a + color using the Scalar constructor, it should look like: -\f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f] + \f[\texttt{Scalar} (blue \_ component, green \_ component, red \_ component[, alpha \_ component])\f] -If you are using your own image rendering and I/O functions, you can use any channel ordering. The -drawing functions process each channel independently and do not depend on the channel order or even -on the used color space. The whole image can be converted from BGR to RGB or to a different color -space using cvtColor . + If you are using your own image rendering and I/O functions, you can use any channel ordering. The + drawing functions process each channel independently and do not depend on the channel order or even + on the used color space. The whole image can be converted from BGR to RGB or to a different color + space using cvtColor . -If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also, -many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means -that the coordinates can be passed as fixed-point numbers encoded as integers. The number of -fractional bits is specified by the shift parameter and the real point coordinates are calculated as -\f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is -especially effective when rendering antialiased shapes. + If a drawn figure is partially or completely outside the image, the drawing functions clip it. Also, + many drawing functions can handle pixel coordinates specified with sub-pixel accuracy. This means + that the coordinates can be passed as fixed-point numbers encoded as integers. The number of + fractional bits is specified by the shift parameter and the real point coordinates are calculated as + \f$\texttt{Point}(x,y)\rightarrow\texttt{Point2f}(x*2^{-shift},y*2^{-shift})\f$ . This feature is + especially effective when rendering antialiased shapes. -@note The functions do not support alpha-transparency when the target image is 4-channel. In this -case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint -semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main -image. + @note The functions do not support alpha-transparency when the target image is 4-channel. In this + case, the color[3] is simply copied to the repainted pixels. Thus, if you want to paint + semi-transparent shapes, you can paint them in a separate buffer and then blend it with the main + image. @defgroup imgproc_color_conversions Color Space Conversions @defgroup imgproc_colormap ColorMaps in OpenCV -The human perception isn't built for observing fine changes in grayscale images. Human eyes are more -sensitive to observing changes between colors, so you often need to recolor your grayscale images to -get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your -computer vision application. + The human perception isn't built for observing fine changes in grayscale images. Human eyes are more + sensitive to observing changes between colors, so you often need to recolor your grayscale images to + get a clue about them. OpenCV now comes with various colormaps to enhance the visualization in your + computer vision application. -In OpenCV you only need applyColorMap to apply a colormap on a given image. 
The following sample -code reads the path to an image from command line, applies a Jet colormap on it and shows the -result: + In OpenCV you only need applyColorMap to apply a colormap on a given image. The following sample + code reads the path to an image from command line, applies a Jet colormap on it and shows the + result: -@include snippets/imgproc_applyColorMap.cpp + @include snippets/imgproc_applyColorMap.cpp -@see #ColormapTypes + @see #ColormapTypes @defgroup imgproc_subdiv2d Planar Subdivision -The Subdiv2D class described in this section is used to perform various planar subdivision on -a set of 2D points (represented as vector of Point2f). OpenCV subdivides a plane into triangles -using the Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram. -In the figure below, the Delaunay's triangulation is marked with black lines and the Voronoi -diagram with red lines. + The Subdiv2D class described in this section is used to perform various planar subdivision on + a set of 2D points (represented as vector of Point2f). OpenCV subdivides a plane into triangles + using the Delaunay's algorithm, which corresponds to the dual graph of the Voronoi diagram. + In the figure below, the Delaunay's triangulation is marked with black lines and the Voronoi + diagram with red lines. -![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png) + ![Delaunay triangulation (black) and Voronoi (red)](pics/delaunay_voronoi.png) -The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast -location of points on the plane, building special graphs (such as NNG,RNG), and so forth. + The subdivisions can be used for the 3D piece-wise transformation of a plane, morphing, fast + location of points on the plane, building special graphs (such as NNG,RNG), and so forth. @defgroup imgproc_hist Histograms @defgroup imgproc_shape Structural Analysis and Shape Descriptors @@ -186,7 +190,6 @@ location of points on the plane, building special graphs (such as NNG,RNG), and @defgroup imgproc_feature Feature Detection @defgroup imgproc_object Object Detection @defgroup imgproc_segmentation Image Segmentation - @defgroup imgproc_c C API @defgroup imgproc_hal Hardware Acceleration Layer @{ @defgroup imgproc_hal_functions Functions @@ -233,7 +236,7 @@ enum MorphShapes { MORPH_CROSS = 1, //!< a cross-shaped structuring element: //!< \f[E_{ij} = \begin{cases} 1 & \texttt{if } {i=\texttt{anchor.y } {or } {j=\texttt{anchor.x}}} \\0 & \texttt{otherwise} \end{cases}\f] MORPH_ELLIPSE = 2 //!< an elliptic structuring element, that is, a filled ellipse inscribed - //!< into the rectangle Rect(0, 0, esize.width, 0.esize.height) + //!< into the rectangle Rect(0, 0, esize.width, esize.height) }; //! @} imgproc_filter @@ -271,7 +274,8 @@ enum InterpolationFlags{ - flag is __not__ set: \f$dst( \rho , \phi ) = src(x,y)\f$ - flag is set: \f$dst(x,y) = src( \rho , \phi )\f$ */ - WARP_INVERSE_MAP = 16 + WARP_INVERSE_MAP = 16, + WARP_RELATIVE_MAP = 32 }; /** \brief Specify the polar mapping mode @@ -637,112 +641,109 @@ enum ColorConversionCodes { COLOR_YUV2BGR = 84, COLOR_YUV2RGB = 85, - //! 
YUV 4:2:0 family to RGB - COLOR_YUV2RGB_NV12 = 90, - COLOR_YUV2BGR_NV12 = 91, - COLOR_YUV2RGB_NV21 = 92, - COLOR_YUV2BGR_NV21 = 93, - COLOR_YUV420sp2RGB = COLOR_YUV2RGB_NV21, - COLOR_YUV420sp2BGR = COLOR_YUV2BGR_NV21, - - COLOR_YUV2RGBA_NV12 = 94, - COLOR_YUV2BGRA_NV12 = 95, - COLOR_YUV2RGBA_NV21 = 96, - COLOR_YUV2BGRA_NV21 = 97, - COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21, - COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21, - - COLOR_YUV2RGB_YV12 = 98, - COLOR_YUV2BGR_YV12 = 99, - COLOR_YUV2RGB_IYUV = 100, - COLOR_YUV2BGR_IYUV = 101, - COLOR_YUV2RGB_I420 = COLOR_YUV2RGB_IYUV, - COLOR_YUV2BGR_I420 = COLOR_YUV2BGR_IYUV, - COLOR_YUV420p2RGB = COLOR_YUV2RGB_YV12, - COLOR_YUV420p2BGR = COLOR_YUV2BGR_YV12, - - COLOR_YUV2RGBA_YV12 = 102, - COLOR_YUV2BGRA_YV12 = 103, - COLOR_YUV2RGBA_IYUV = 104, - COLOR_YUV2BGRA_IYUV = 105, - COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV, - COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV, - COLOR_YUV420p2RGBA = COLOR_YUV2RGBA_YV12, - COLOR_YUV420p2BGRA = COLOR_YUV2BGRA_YV12, - - COLOR_YUV2GRAY_420 = 106, - COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420, - COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420, - COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420, - COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420, - COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420, - COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420, - COLOR_YUV420p2GRAY = COLOR_YUV2GRAY_420, - - //! YUV 4:2:2 family to RGB - COLOR_YUV2RGB_UYVY = 107, - COLOR_YUV2BGR_UYVY = 108, - //COLOR_YUV2RGB_VYUY = 109, - //COLOR_YUV2BGR_VYUY = 110, - COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY, - COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY, - COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY, - COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY, - - COLOR_YUV2RGBA_UYVY = 111, - COLOR_YUV2BGRA_UYVY = 112, - //COLOR_YUV2RGBA_VYUY = 113, - //COLOR_YUV2BGRA_VYUY = 114, - COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY, - COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY, - COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY, - COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY, - - COLOR_YUV2RGB_YUY2 = 115, - COLOR_YUV2BGR_YUY2 = 116, - COLOR_YUV2RGB_YVYU = 117, - COLOR_YUV2BGR_YVYU = 118, - COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2, - COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2, - COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2, - COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2, - - COLOR_YUV2RGBA_YUY2 = 119, - COLOR_YUV2BGRA_YUY2 = 120, - COLOR_YUV2RGBA_YVYU = 121, - COLOR_YUV2BGRA_YVYU = 122, - COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2, - COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2, - COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2, - COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2, - - COLOR_YUV2GRAY_UYVY = 123, - COLOR_YUV2GRAY_YUY2 = 124, - //CV_YUV2GRAY_VYUY = CV_YUV2GRAY_UYVY, - COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY, - COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY, - COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2, - COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2, - COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2, + COLOR_YUV2RGB_NV12 = 90, //!< convert between 4:2:0-subsampled YUV NV12 and RGB, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_NV12 = 91, //!< convert between 4:2:0-subsampled YUV NV12 and BGR, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_NV21 = 92, //!< convert between 4:2:0-subsampled YUV NV21 and RGB, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_NV21 = 93, //!< convert between 4:2:0-subsampled YUV NV21 and BGR, two planes (in one or separate arrays): Y and V/U 
interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV420sp2RGB = COLOR_YUV2RGB_NV21, //!< synonym to NV21 + COLOR_YUV420sp2BGR = COLOR_YUV2BGR_NV21, //!< synonym to NV21 + + COLOR_YUV2RGBA_NV12 = 94, //!< convert between 4:2:0-subsampled YUV NV12 and RGBA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_NV12 = 95, //!< convert between 4:2:0-subsampled YUV NV12 and BGRA, two planes (in one or separate arrays): Y and U/V interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_NV21 = 96, //!< convert between 4:2:0-subsampled YUV NV21 and RGBA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_NV21 = 97, //!< convert between 4:2:0-subsampled YUV NV21 and BGRA, two planes (in one or separate arrays): Y and V/U interleaved, see @ref color_convert_rgb_yuv_42x + COLOR_YUV420sp2RGBA = COLOR_YUV2RGBA_NV21, //!< synonym to NV21 + COLOR_YUV420sp2BGRA = COLOR_YUV2BGRA_NV21, //!< synonym to NV21 + + COLOR_YUV2RGB_YV12 = 98, //!< convert between 4:2:0-subsampled YUV YV12 and RGB, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_YV12 = 99, //!< convert between 4:2:0-subsampled YUV YV12 and BGR, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_IYUV = 100, //!< convert between 4:2:0-subsampled YUV IYUV and RGB, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_IYUV = 101, //!< convert between 4:2:0-subsampled YUV IYUV and BGR, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_I420 = COLOR_YUV2RGB_IYUV, //!< synonym to IYUV + COLOR_YUV2BGR_I420 = COLOR_YUV2BGR_IYUV, //!< synonym to IYUV + COLOR_YUV420p2RGB = COLOR_YUV2RGB_YV12, //!< synonym to YV12 + COLOR_YUV420p2BGR = COLOR_YUV2BGR_YV12, //!< synonym to YV12 + + COLOR_YUV2RGBA_YV12 = 102, //!< convert between 4:2:0-subsampled YUV YV12 and RGBA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_YV12 = 103, //!< convert between 4:2:0-subsampled YUV YV12 and BGRA, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_IYUV = 104, //!< convert between 4:2:0-subsampled YUV YV12 and RGBA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_IYUV = 105, //!< convert between 4:2:0-subsampled YUV YV12 and BGRA, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_I420 = COLOR_YUV2RGBA_IYUV, //!< synonym to IYUV + COLOR_YUV2BGRA_I420 = COLOR_YUV2BGRA_IYUV, //!< synonym to IYUV + COLOR_YUV420p2RGBA = COLOR_YUV2RGBA_YV12, //!< synonym to YV12 + COLOR_YUV420p2BGRA = COLOR_YUV2BGRA_YV12, //!< synonym to YV12 + + COLOR_YUV2GRAY_420 = 106, //!< extract Y channel from YUV 4:2:0 image + COLOR_YUV2GRAY_NV21 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV2GRAY_NV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV2GRAY_YV12 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV2GRAY_IYUV = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV2GRAY_I420 = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV420sp2GRAY = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + COLOR_YUV420p2GRAY = COLOR_YUV2GRAY_420, //!< synonym to COLOR_YUV2GRAY_420 + + COLOR_YUV2RGB_UYVY = 107, //!< convert between YUV UYVY and RGB, YUV is 4:2:2-subsampled and 
interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_UYVY = 108, //!< convert between YUV UYVY and BGR, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + //COLOR_YUV2RGB_VYUY = 109, //!< convert between YUV VYUY and RGB, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x + //COLOR_YUV2BGR_VYUY = 110, //!< convert between YUV VYUY and BGR, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_Y422 = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY + COLOR_YUV2BGR_Y422 = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY + COLOR_YUV2RGB_UYNV = COLOR_YUV2RGB_UYVY, //!< synonym to UYVY + COLOR_YUV2BGR_UYNV = COLOR_YUV2BGR_UYVY, //!< synonym to UYVY + + COLOR_YUV2RGBA_UYVY = 111, //!< convert between YUV UYVY and RGBA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_UYVY = 112, //!< convert between YUV UYVY and BGRA, YUV is 4:2:2-subsampled and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + //COLOR_YUV2RGBA_VYUY = 113, //!< convert between YUV VYUY and RGBA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x + //COLOR_YUV2BGRA_VYUY = 114, //!< convert between YUV VYUY and BGRA, YUV is 4:2:2-subsampled and interleaved as V/Y1/U/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_Y422 = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY + COLOR_YUV2BGRA_Y422 = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY + COLOR_YUV2RGBA_UYNV = COLOR_YUV2RGBA_UYVY, //!< synonym to UYVY + COLOR_YUV2BGRA_UYNV = COLOR_YUV2BGRA_UYVY, //!< synonym to UYVY + + COLOR_YUV2RGB_YUY2 = 115, //!< convert between YUV YUY2 and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_YUY2 = 116, //!< convert between YUV YUY2 and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_YVYU = 117, //!< convert between YUV YVYU and RGB, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGR_YVYU = 118, //!< convert between YUV YVYU and BGR, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGB_YUYV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2 + COLOR_YUV2BGR_YUYV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2 + COLOR_YUV2RGB_YUNV = COLOR_YUV2RGB_YUY2, //!< synonym to YUY2 + COLOR_YUV2BGR_YUNV = COLOR_YUV2BGR_YUY2, //!< synonym to YUY2 + + COLOR_YUV2RGBA_YUY2 = 119, //!< convert between YUV YUY2 and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_YUY2 = 120, //!< convert between YUV YUY2 and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_YVYU = 121, //!< convert between YUV YVYU and RGBA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2BGRA_YVYU = 122, //!< convert between YUV YVYU and BGRA, YUV is 4:2:2-subsampled and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_YUV2RGBA_YUYV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2 + COLOR_YUV2BGRA_YUYV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2 + COLOR_YUV2RGBA_YUNV = COLOR_YUV2RGBA_YUY2, //!< synonym to YUY2 + COLOR_YUV2BGRA_YUNV = COLOR_YUV2BGRA_YUY2, //!< synonym to YUY2 + + COLOR_YUV2GRAY_UYVY = 123, //!< extract Y channel from YUV 4:2:2 image + 
COLOR_YUV2GRAY_YUY2 = 124, //!< extract Y channel from YUV 4:2:2 image + //CV_YUV2GRAY_VYUY = CV_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY + COLOR_YUV2GRAY_Y422 = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY + COLOR_YUV2GRAY_UYNV = COLOR_YUV2GRAY_UYVY, //!< synonym to COLOR_YUV2GRAY_UYVY + COLOR_YUV2GRAY_YVYU = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2 + COLOR_YUV2GRAY_YUYV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2 + COLOR_YUV2GRAY_YUNV = COLOR_YUV2GRAY_YUY2, //!< synonym to COLOR_YUV2GRAY_YUY2 //! alpha premultiplication COLOR_RGBA2mRGBA = 125, COLOR_mRGBA2RGBA = 126, - //! RGB to YUV 4:2:0 family - COLOR_RGB2YUV_I420 = 127, - COLOR_BGR2YUV_I420 = 128, - COLOR_RGB2YUV_IYUV = COLOR_RGB2YUV_I420, - COLOR_BGR2YUV_IYUV = COLOR_BGR2YUV_I420, - - COLOR_RGBA2YUV_I420 = 129, - COLOR_BGRA2YUV_I420 = 130, - COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420, - COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420, - COLOR_RGB2YUV_YV12 = 131, - COLOR_BGR2YUV_YV12 = 132, - COLOR_RGBA2YUV_YV12 = 133, - COLOR_BGRA2YUV_YV12 = 134, + COLOR_RGB2YUV_I420 = 127, //!< convert between RGB and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_BGR2YUV_I420 = 128, //!< convert between BGR and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_RGB2YUV_IYUV = COLOR_RGB2YUV_I420, //!< synonym to I420 + COLOR_BGR2YUV_IYUV = COLOR_BGR2YUV_I420, //!< synonym to I420 + + COLOR_RGBA2YUV_I420 = 129, //!< convert between RGBA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_BGRA2YUV_I420 = 130, //!< convert between BGRA and 4:2:0-subsampled YUV I420, three planes in one array: Y, U and V, see @ref color_convert_rgb_yuv_42x + COLOR_RGBA2YUV_IYUV = COLOR_RGBA2YUV_I420, //!< synonym to I420 + COLOR_BGRA2YUV_IYUV = COLOR_BGRA2YUV_I420, //!< synonym to I420 + COLOR_RGB2YUV_YV12 = 131, //!< convert between RGB and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_BGR2YUV_YV12 = 132, //!< convert between BGR and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_RGBA2YUV_YV12 = 133, //!< convert between RGBA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x + COLOR_BGRA2YUV_YV12 = 134, //!< convert between BGRA and 4:2:0-subsampled YUV YV12, three planes in one array: Y, V and U, see @ref color_convert_rgb_yuv_42x //! 
Demosaicing, see @ref color_convert_bayer "color conversions" for additional information COLOR_BayerBG2BGR = 46, //!< equivalent to RGGB Bayer pattern @@ -838,7 +839,39 @@ enum ColorConversionCodes { COLOR_BayerRG2RGBA = COLOR_BayerBG2BGRA, //!< equivalent to BGGR Bayer pattern COLOR_BayerGR2RGBA = COLOR_BayerGB2BGRA, //!< equivalent to GBRG Bayer pattern - COLOR_COLORCVT_MAX = 143 + COLOR_RGB2YUV_UYVY = 143, //!< convert between RGB and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_BGR2YUV_UYVY = 144, //!< convert between BGR and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_RGB2YUV_Y422 = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY + COLOR_BGR2YUV_Y422 = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY + COLOR_RGB2YUV_UYNV = COLOR_RGB2YUV_UYVY, //!< synonym to UYVY + COLOR_BGR2YUV_UYNV = COLOR_BGR2YUV_UYVY, //!< synonym to UYVY + + COLOR_RGBA2YUV_UYVY = 145, //!< convert between RGBA and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_BGRA2YUV_UYVY = 146, //!< convert between BGRA and YUV UYVU, YUV is 4:2:2 and interleaved as U/Y1/V/Y2, see @ref color_convert_rgb_yuv_42x + COLOR_RGBA2YUV_Y422 = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY + COLOR_BGRA2YUV_Y422 = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY + COLOR_RGBA2YUV_UYNV = COLOR_RGBA2YUV_UYVY, //!< synonym to UYVY + COLOR_BGRA2YUV_UYNV = COLOR_BGRA2YUV_UYVY, //!< synonym to UYVY + + COLOR_RGB2YUV_YUY2 = 147, //!< convert between RGB and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_BGR2YUV_YUY2 = 148, //!< convert between BGR and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_RGB2YUV_YVYU = 149, //!< convert between RGB and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_BGR2YUV_YVYU = 150, //!< convert between BGR and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_RGB2YUV_YUYV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2 + COLOR_BGR2YUV_YUYV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2 + COLOR_RGB2YUV_YUNV = COLOR_RGB2YUV_YUY2, //!< synonym to YUY2 + COLOR_BGR2YUV_YUNV = COLOR_BGR2YUV_YUY2, //!< synonym to YUY2 + + COLOR_RGBA2YUV_YUY2 = 151, //!< convert between RGBA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_BGRA2YUV_YUY2 = 152, //!< convert between BGRA and YUV YUY2, YUV is 4:2:2 and interleaved as Y1/U/Y2/V, see @ref color_convert_rgb_yuv_42x + COLOR_RGBA2YUV_YVYU = 153, //!< convert between RGBA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_BGRA2YUV_YVYU = 154, //!< convert between BGRA and YUV YVYU, YUV is 4:2:2 and interleaved as Y1/V/Y2/U, see @ref color_convert_rgb_yuv_42x + COLOR_RGBA2YUV_YUYV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2 + COLOR_BGRA2YUV_YUYV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2 + COLOR_RGBA2YUV_YUNV = COLOR_RGBA2YUV_YUY2, //!< synonym to YUY2 + COLOR_BGRA2YUV_YUNV = COLOR_BGRA2YUV_YUY2, //!< synonym to YUY2 + + COLOR_COLORCVT_MAX = 155 }; //! @addtogroup imgproc_shape @@ -1503,12 +1536,14 @@ respectively (see #getGaussianKernel for details); to fully control the result r possible future modifications of all this semantics, it is recommended to specify all of ksize, sigmaX, and sigmaY. @param borderType pixel extrapolation method, see #BorderTypes. 
#BORDER_WRAP is not supported. +@param hint Implementation modfication flags. See #AlgorithmHint @sa sepFilter2D, filter2D, blur, boxFilter, bilateralFilter, medianBlur */ CV_EXPORTS_W void GaussianBlur( InputArray src, OutputArray dst, Size ksize, double sigmaX, double sigmaY = 0, - int borderType = BORDER_DEFAULT ); + int borderType = BORDER_DEFAULT, + AlgorithmHint hint = cv::ALGO_HINT_DEFAULT ); /** @brief Applies the bilateral filter to an image. @@ -1616,6 +1651,23 @@ CV_EXPORTS_W void blur( InputArray src, OutputArray dst, Size ksize, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT ); +/** @brief Blurs an image using the stackBlur. + +The function applies and stackBlur to an image. +stackBlur can generate similar results as Gaussian blur, and the time consumption does not increase with the increase of kernel size. +It creates a kind of moving stack of colors whilst scanning through the image. Thereby it just has to add one new block of color to the right side +of the stack and remove the leftmost color. The remaining colors on the topmost layer of the stack are either added on or reduced by one, +depending on if they are on the right or on the left side of the stack. The only supported borderType is BORDER_REPLICATE. +Original paper was proposed by Mario Klingemann, which can be found http://underdestruction.com/2004/02/25/stackblur-2004. + +@param src input image. The number of channels can be arbitrary, but the depth should be one of +CV_8U, CV_16U, CV_16S or CV_32F. +@param dst output image of the same size and type as src. +@param ksize stack-blurring kernel size. The ksize.width and ksize.height can differ but they both must be +positive and odd. +*/ +CV_EXPORTS_W void stackBlur(InputArray src, OutputArray dst, Size ksize); + /** @brief Convolves an image with the kernel. The function applies an arbitrary linear filter to an image. In-place operation is supported. When @@ -1792,7 +1844,7 @@ with the following \f$3 \times 3\f$ aperture: @param src Source image. @param dst Destination image of the same size and the same number of channels as src . -@param ddepth Desired depth of the destination image. +@param ddepth Desired depth of the destination image, see @ref filter_depths "combinations". @param ksize Aperture size used to compute the second-derivative filters. See #getDerivKernels for details. The size must be positive and odd. @param scale Optional scale factor for the computed Laplacian values. By default, no scaling is @@ -2095,28 +2147,31 @@ transform. @param image 8-bit, single-channel binary source image. The image may be modified by the function. @param lines Output vector of lines. Each line is represented by a 2 or 3 element vector -\f$(\rho, \theta)\f$ or \f$(\rho, \theta, \textrm{votes})\f$ . \f$\rho\f$ is the distance from the coordinate origin \f$(0,0)\f$ (top-left corner of -the image). \f$\theta\f$ is the line rotation angle in radians ( -\f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ). +\f$(\rho, \theta)\f$ or \f$(\rho, \theta, \textrm{votes})\f$, where \f$\rho\f$ is the distance from +the coordinate origin \f$(0,0)\f$ (top-left corner of the image), \f$\theta\f$ is the line rotation +angle in radians ( \f$0 \sim \textrm{vertical line}, \pi/2 \sim \textrm{horizontal line}\f$ ), and \f$\textrm{votes}\f$ is the value of accumulator. @param rho Distance resolution of the accumulator in pixels. @param theta Angle resolution of the accumulator in radians. -@param threshold Accumulator threshold parameter. 
Only those lines are returned that get enough +@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough votes ( \f$>\texttt{threshold}\f$ ). -@param srn For the multi-scale Hough transform, it is a divisor for the distance resolution rho . +@param srn For the multi-scale Hough transform, it is a divisor for the distance resolution rho. The coarse accumulator distance resolution is rho and the accurate accumulator resolution is -rho/srn . If both srn=0 and stn=0 , the classical Hough transform is used. Otherwise, both these +rho/srn. If both srn=0 and stn=0, the classical Hough transform is used. Otherwise, both these parameters should be positive. @param stn For the multi-scale Hough transform, it is a divisor for the distance resolution theta. @param min_theta For standard and multi-scale Hough transform, minimum angle to check for lines. Must fall between 0 and max_theta. -@param max_theta For standard and multi-scale Hough transform, maximum angle to check for lines. -Must fall between min_theta and CV_PI. +@param max_theta For standard and multi-scale Hough transform, an upper bound for the angle. +Must fall between min_theta and CV_PI. The actual maximum angle in the accumulator may be slightly +less than max_theta, depending on the parameters min_theta and theta. +@param use_edgeval True if you want to use weighted Hough transform. */ CV_EXPORTS_W void HoughLines( InputArray image, OutputArray lines, double rho, double theta, int threshold, double srn = 0, double stn = 0, - double min_theta = 0, double max_theta = CV_PI ); + double min_theta = 0, double max_theta = CV_PI, + bool use_edgeval = false ); /** @brief Finds line segments in a binary image using the probabilistic Hough transform. @@ -2139,7 +2194,7 @@ And this is the output of the above program in case of the probabilistic Hough t line segment. @param rho Distance resolution of the accumulator in pixels. @param theta Angle resolution of the accumulator in radians. -@param threshold Accumulator threshold parameter. Only those lines are returned that get enough +@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough votes ( \f$>\texttt{threshold}\f$ ). @param minLineLength Minimum line length. Line segments shorter than that are rejected. @param maxLineGap Maximum allowed gap between points on the same line to link them. @@ -2158,13 +2213,14 @@ The function finds lines in a set of points using a modification of the Hough tr @param lines Output vector of found lines. Each vector is encoded as a vector \f$(votes, rho, theta)\f$. The larger the value of 'votes', the higher the reliability of the Hough line. @param lines_max Max count of Hough lines. -@param threshold Accumulator threshold parameter. Only those lines are returned that get enough +@param threshold %Accumulator threshold parameter. Only those lines are returned that get enough votes ( \f$>\texttt{threshold}\f$ ). @param min_rho Minimum value for \f$\rho\f$ for the accumulator (Note: \f$\rho\f$ can be negative. The absolute value \f$|\rho|\f$ is the distance of a line to the origin.). @param max_rho Maximum value for \f$\rho\f$ for the accumulator. @param rho_step Distance resolution of the accumulator. @param min_theta Minimum angle value of the accumulator in radians. -@param max_theta Maximum angle value of the accumulator in radians. +@param max_theta Upper bound for the angle value of the accumulator in radians. 
The actual maximum +angle may be slightly less than max_theta, depending on the parameters min_theta and theta_step. @param theta_step Angle resolution of the accumulator in radians. */ CV_EXPORTS_W void HoughLinesPointSet( InputArray point, OutputArray lines, int lines_max, int threshold, @@ -2204,7 +2260,7 @@ too large, some circles may be missed. @param param1 First method-specific parameter. In case of #HOUGH_GRADIENT and #HOUGH_GRADIENT_ALT, it is the higher threshold of the two passed to the Canny edge detector (the lower one is twice smaller). Note that #HOUGH_GRADIENT_ALT uses #Scharr algorithm to compute image derivatives, so the threshold value -shough normally be higher, such as 300 or normally exposed and contrasty images. +should normally be higher, such as 300 or normally exposed and contrasty images. @param param2 Second method-specific parameter. In case of #HOUGH_GRADIENT, it is the accumulator threshold for the circle centers at the detection stage. The smaller it is, the more false circles may be detected. Circles, corresponding to the larger accumulator values, will be @@ -2279,7 +2335,7 @@ case of multi-channel images, each channel is processed independently. @param src input image; the number of channels can be arbitrary, but the depth should be one of CV_8U, CV_16U, CV_16S, CV_32F or CV_64F. @param dst output image of the same size and type as src. -@param kernel structuring element used for dilation; if elemenat=Mat(), a 3 x 3 rectangular +@param kernel structuring element used for dilation; if element=Mat(), a 3 x 3 rectangular structuring element is used. Kernel can be created using #getStructuringElement @param anchor position of the anchor within the element; default value (-1, -1) means that the anchor is at the element center. @@ -2434,6 +2490,10 @@ The function remap transforms the source image using the specified map: \f[\texttt{dst} (x,y) = \texttt{src} (map_x(x,y),map_y(x,y))\f] +with the WARP_RELATIVE_MAP flag : + +\f[\texttt{dst} (x,y) = \texttt{src} (x+map_x(x,y),y+map_y(x,y))\f] + where values of pixels with non-integer coordinates are computed using one of available interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in @@ -2452,7 +2512,9 @@ representation to fixed-point for speed. @param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map if map1 is (x,y) points), respectively. @param interpolation Interpolation method (see #InterpolationFlags). The methods #INTER_AREA -and #INTER_LINEAR_EXACT are not supported by this function. +#INTER_LINEAR_EXACT and #INTER_NEAREST_EXACT are not supported by this function. +The extra flag WARP_RELATIVE_MAP can be ORed to the interpolation method +(e.g. INTER_LINEAR | WARP_RELATIVE_MAP) @param borderMode Pixel extrapolation method (see #BorderTypes). When borderMode=#BORDER_TRANSPARENT, it means that the pixels in the destination image that corresponds to the "outliers" in the source image are not modified by the function. @@ -2809,7 +2871,7 @@ It makes possible to do a fast blurring or fast block correlation with a variabl example. In case of multi-channel images, sums for each channel are accumulated independently. As a practical example, the next figure shows the calculation of the integral of a straight -rectangle Rect(3,3,3,2) and of a tilted rectangle Rect(5,1,2,3) . 
The selected pixels in the +rectangle Rect(4,4,3,2) and of a tilted rectangle Rect(5,1,2,3) . The selected pixels in the original image are shown, as well as the relative pixels in the integral images sum and tilted . ![integral calculation example](pics/integral.png) @@ -3174,7 +3236,14 @@ CV_EXPORTS void calcHist( const Mat* images, int nimages, const int* histSize, const float** ranges, bool uniform = true, bool accumulate = false ); -/** @overload */ +/** @overload + +this variant supports only uniform histograms. + +ranges argument is either empty vector or a flattened vector of histSize.size()*2 elements +(histSize.size() element pairs). The first and second elements of each pair specify the lower and +upper boundaries. +*/ CV_EXPORTS_W void calcHist( InputArrayOfArrays images, const std::vector& channels, InputArray mask, OutputArray hist, @@ -3217,7 +3286,7 @@ images[0].channels() + images[1].channels()-1, and so on. size and depth as images[0] . @param ranges Array of arrays of the histogram bin boundaries in each dimension. See #calcHist . @param scale Optional scale factor for the output back projection. -@param uniform Flag indicating whether the histogram is uniform or not (see above). +@param uniform Flag indicating whether the histogram is uniform or not (see #calcHist). @sa calcHist, compareHist */ @@ -3659,20 +3728,21 @@ floating-point. @param code color space conversion code (see #ColorConversionCodes). @param dstCn number of channels in the destination image; if the parameter is 0, the number of the channels is derived automatically from src and code. +@param hint Implementation modfication flags. See #AlgorithmHint @see @ref imgproc_color_conversions */ -CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0 ); +CV_EXPORTS_W void cvtColor( InputArray src, OutputArray dst, int code, int dstCn = 0, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT ); /** @brief Converts an image from one color space to another where the source image is stored in two planes. This function only supports YUV420 to RGB conversion as of now. -@param src1: 8-bit image (#CV_8U) of the Y plane. -@param src2: image containing interleaved U/V plane. -@param dst: output image. -@param code: Specifies the type of conversion. It can take any of the following values: +@param src1 8-bit image (#CV_8U) of the Y plane. +@param src2 image containing interleaved U/V plane. +@param dst output image. +@param code Specifies the type of conversion. It can take any of the following values: - #COLOR_YUV2BGR_NV12 - #COLOR_YUV2RGB_NV12 - #COLOR_YUV2BGRA_NV12 @@ -3681,8 +3751,9 @@ This function only supports YUV420 to RGB conversion as of now. - #COLOR_YUV2RGB_NV21 - #COLOR_YUV2BGRA_NV21 - #COLOR_YUV2RGBA_NV21 +@param hint Implementation modfication flags. See #AlgorithmHint */ -CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code ); +CV_EXPORTS_W void cvtColorTwoPlane( InputArray src1, InputArray src2, OutputArray dst, int code, AlgorithmHint hint = cv::ALGO_HINT_DEFAULT ); /** @brief main function for all demosaicing processes @@ -3726,8 +3797,8 @@ CV_EXPORTS_W void demosaicing(InputArray src, OutputArray dst, int code, int dst The function computes moments, up to the 3rd order, of a vector shape or a rasterized shape. The results are returned in the structure cv::Moments. -@param array Raster image (single-channel, 8-bit or floating-point 2D array) or an array ( -\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f ). 
+@param array Single chanel raster image (CV_8U, CV_16U, CV_16S, CV_32F, CV_64F) or an array ( +\f$1 \times N\f$ or \f$N \times 1\f$ ) of 2D points (Point or Point2f). @param binaryImage If it is true, all non-zero image pixels are treated as 1's. The parameter is used for images only. @returns moments. @@ -3963,15 +4034,18 @@ CV_EXPORTS_W void findContours( InputArray image, OutputArrayOfArrays contours, CV_EXPORTS void findContours( InputArray image, OutputArrayOfArrays contours, int mode, int method, Point offset = Point()); -/** @example samples/cpp/squares.cpp -A program using pyramid scaling, Canny, contours and contour simplification to find -squares in a list of images (pic1-6.png). Returns sequence of squares detected on the image. -*/ +//! @brief Find contours using link runs algorithm +//! +//! This function implements an algorithm different from cv::findContours: +//! - doesn't allocate temporary image internally, thus it has reduced memory consumption +//! - supports CV_8UC1 images only +//! - outputs 2-level hierarhy only (RETR_CCOMP mode) +//! - doesn't support approximation change other than CHAIN_APPROX_SIMPLE +//! In all other aspects this function is compatible with cv::findContours. +CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours, OutputArray hierarchy); -/** @example samples/tapi/squares.cpp -A program using pyramid scaling, Canny, contours and contour simplification to find -squares in the input image. -*/ +//! @overload +CV_EXPORTS_W void findContoursLinkRuns(InputArray image, OutputArrayOfArrays contours); /** @brief Approximates a polygonal curve(s) with the specified precision. @@ -3990,6 +4064,28 @@ CV_EXPORTS_W void approxPolyDP( InputArray curve, OutputArray approxCurve, double epsilon, bool closed ); +/** @brief Approximates a polygon with a convex hull with a specified accuracy and number of sides. + +The cv::approxPolyN function approximates a polygon with a convex hull +so that the difference between the contour area of the original contour and the new polygon is minimal. +It uses a greedy algorithm for contracting two vertices into one in such a way that the additional area is minimal. +Straight lines formed by each edge of the convex contour are drawn and the areas of the resulting triangles are considered. +Each vertex will lie either on the original contour or outside it. + +The algorithm based on the paper @cite LowIlie2003 . + +@param curve Input vector of a 2D points stored in std::vector or Mat, points must be float or integer. +@param approxCurve Result of the approximation. The type is vector of a 2D point (Point2f or Point) in std::vector or Mat. +@param nsides The parameter defines the number of sides of the result polygon. +@param epsilon_percentage defines the percentage of the maximum of additional area. +If it equals -1, it is not used. Otherwise algorighm stops if additional area is greater than contourArea(_curve) * percentage. +If additional area exceeds the limit, algorithm returns as many vertices as there were at the moment the limit was exceeded. +@param ensure_convex If it is true, algorithm creates a convex hull of input contour. Otherwise input vector should be convex. + */ +CV_EXPORTS_W void approxPolyN(InputArray curve, OutputArray approxCurve, + int nsides, float epsilon_percentage = -1.0, + bool ensure_convex = true); + /** @brief Calculates a contour perimeter or a curve length. The function computes a curve length or a closed contour perimeter. 
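A minimal usage sketch of approxPolyN as documented above, assuming `mask` is a binary CV_8UC1 image supplied by the caller:

@code{.cpp}
#include <opencv2/imgproc.hpp>
#include <vector>

// Take the first external contour of a binary mask and approximate it
// with a 4-sided convex polygon.
std::vector<cv::Point> approximateQuad(const cv::Mat& mask)
{
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(mask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);

    std::vector<cv::Point> quad;
    if (!contours.empty())
    {
        // nsides = 4; epsilon_percentage = -1 disables the additional-area limit
        cv::approxPolyN(contours[0], quad, 4, -1.0f, true);
    }
    return quad;
}
@endcode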
@@ -4056,7 +4152,7 @@ The function finds the four vertices of a rotated rectangle. This function is us rectangle. In C++, instead of using this function, you can directly use RotatedRect::points method. Please visit the @ref tutorial_bounding_rotated_ellipses "tutorial on Creating Bounding rotated boxes and ellipses for contours" for more information. -@param box The input rotated rectangle. It may be the output of +@param box The input rotated rectangle. It may be the output of @ref minAreaRect. @param points The output array of four vertices of rectangles. */ CV_EXPORTS_W void boxPoints(RotatedRect box, OutputArray points); @@ -4183,7 +4279,7 @@ Examples of how intersectConvexConvex works When false, no intersection is found. If the polygons share a side or the vertex of one polygon lies on an edge of the other, they are not considered nested and an intersection will be found regardless of the value of handleNested. -@returns Absolute value of area of intersecting polygon +@returns Area of intersecting polygon. May be negative, if algorithm has not converged, e.g. non-convex input. @note intersectConvexConvex doesn't confirm that both polygons are convex and will return invalid results if they aren't. */ @@ -4248,7 +4344,7 @@ CV_EXPORTS_W RotatedRect fitEllipseAMS( InputArray points ); The function calculates the ellipse that fits a set of 2D points. It returns the rotated rectangle in which the ellipse is inscribed. - The Direct least square (Direct) method by @cite Fitzgibbon1999 is used. + The Direct least square (Direct) method by @cite oy1998NumericallySD is used. For an ellipse, this basis set is \f$ \chi= \left(x^2, x y, y^2, x, y, 1\right) \f$, which is a set of six free coefficients \f$ A^T=\left\{A_{\text{xx}},A_{\text{xy}},A_{\text{yy}},A_x,A_y,A_0\right\} \f$. @@ -4406,7 +4502,7 @@ An example using applyColorMap function /** @brief Applies a GNU Octave/MATLAB equivalent colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. @param dst The result is the colormapped source image. Note: Mat::create is called on dst. @param colormap The colormap to apply, see #ColormapTypes */ @@ -4414,8 +4510,8 @@ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, int colormap); /** @brief Applies a user colormap on a given image. -@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. -@param dst The result is the colormapped source image. Note: Mat::create is called on dst. +@param src The source image, grayscale or colored of type CV_8UC1 or CV_8UC3. If CV_8UC3, then the CV_8UC1 image is generated internally using cv::COLOR_BGR2GRAY. +@param dst The result is the colormapped source image of the same number of channels as userColor. Note: Mat::create is called on dst. @param userColor The colormap to apply of type CV_8UC1 or CV_8UC3 and size 256 */ CV_EXPORTS_W void applyColorMap(InputArray src, OutputArray dst, InputArray userColor); diff --git a/3rdParty/opencv2/imgproc/detail/legacy.hpp b/3rdParty/opencv2/imgproc/detail/legacy.hpp new file mode 100644 index 0000000000..029d9c90e8 --- /dev/null +++ b/3rdParty/opencv2/imgproc/detail/legacy.hpp @@ -0,0 +1,38 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +#ifndef OPENCV_IMGPROC_DETAIL_LEGACY_HPP +#define OPENCV_IMGPROC_DETAIL_LEGACY_HPP + +#include "opencv2/imgproc.hpp" + +namespace cv { + +#ifdef __OPENCV_BUILD + +CV_EXPORTS void findContours_legacy(InputArray _image, + OutputArrayOfArrays _contours, + OutputArray _hierarchy, + int mode, + int method, + Point offset = Point()); +CV_EXPORTS void findContours_legacy(InputArray image, + OutputArrayOfArrays contours, + int mode, + int method, + Point offset = Point()); + +CV_EXPORTS float EMD_legacy( InputArray _signature1, InputArray _signature2, + int distType, InputArray _cost, + float* lowerBound, OutputArray _flow ); + +CV_EXPORTS float wrapperEMD_legacy(InputArray _signature1, InputArray _signature2, + int distType, InputArray _cost, + Ptr lowerBound, OutputArray _flow); + +#endif + +} // namespace cv + +#endif // OPENCV_IMGPROC_DETAIL_LEGACY_HPP diff --git a/3rdParty/opencv2/imgproc/hal/hal.hpp b/3rdParty/opencv2/imgproc/hal/hal.hpp index 033428f5ae..814b19ea7e 100644 --- a/3rdParty/opencv2/imgproc/hal/hal.hpp +++ b/3rdParty/opencv2/imgproc/hal/hal.hpp @@ -3,6 +3,7 @@ #include "opencv2/core/cvdef.h" #include "opencv2/core/cvstd.hpp" +#include "opencv2/core/utility.hpp" #include "opencv2/core/hal/interface.h" namespace cv { namespace hal { @@ -108,11 +109,19 @@ CV_EXPORTS void warpAffine(int src_type, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, const double M[6], int interpolation, int borderType, const double borderValue[4]); +CV_EXPORTS void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw); + +CV_EXPORTS void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw); + CV_EXPORTS void warpPerspective(int src_type, const uchar * src_data, size_t src_step, int src_width, int src_height, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, const double M[9], int interpolation, int borderType, const double borderValue[4]); +CV_EXPORTS void warpPerspectiveBlocklineNN(const double *M, short* xy, double X0, double Y0, double W0, int bw); + +CV_EXPORTS void warpPerspectiveBlockline(const double *M, short* xy, short* alpha, double X0, double Y0, double W0, int bw); + CV_EXPORTS void cvtBGRtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, @@ -150,12 +159,14 @@ CV_EXPORTS void cvtGraytoBGR5x5(const uchar * src_data, size_t src_step, CV_EXPORTS void cvtBGRtoYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, - int depth, int scn, bool swapBlue, bool isCbCr); + int depth, int scn, bool swapBlue, bool isCbCr, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, - int depth, int dcn, bool swapBlue, bool isCbCr); + int depth, int dcn, bool swapBlue, bool isCbCr, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtBGRtoXYZ(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, @@ -190,28 +201,33 @@ CV_EXPORTS void cvtLabtoBGR(const uchar * src_data, size_t src_step, CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx); + int dcn, bool swapBlue, int uIdx, + AlgorithmHint 
hint = ALGO_HINT_DEFAULT); //! Separate Y and UV planes CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, const uchar * uv_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx); + int dcn, bool swapBlue, int uIdx, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtTwoPlaneYUVtoBGR(const uchar * y_data, size_t y_step, const uchar * uv_data, size_t uv_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx); + int dcn, bool swapBlue, int uIdx, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtThreePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int dst_width, int dst_height, - int dcn, bool swapBlue, int uIdx); + int dcn, bool swapBlue, int uIdx, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtBGRtoThreePlaneYUV(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, - int scn, bool swapBlue, int uIdx); + int scn, bool swapBlue, int uIdx, + AlgorithmHint hint = ALGO_HINT_DEFAULT); //! Separate Y and UV planes CV_EXPORTS void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, @@ -222,7 +238,14 @@ CV_EXPORTS void cvtBGRtoTwoPlaneYUV(const uchar * src_data, size_t src_step, CV_EXPORTS void cvtOnePlaneYUVtoBGR(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, int width, int height, - int dcn, bool swapBlue, int uIdx, int ycn); + int dcn, bool swapBlue, int uIdx, int ycn, + AlgorithmHint hint = ALGO_HINT_DEFAULT); + +CV_EXPORTS void cvtOnePlaneBGRtoYUV(const uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int scn, bool swapBlue, int uIdx, int ycn, + AlgorithmHint hint = ALGO_HINT_DEFAULT); CV_EXPORTS void cvtRGBAtoMultipliedRGBA(const uchar * src_data, size_t src_step, uchar * dst_data, size_t dst_step, diff --git a/3rdParty/opencv2/imgproc/hal/interface.h b/3rdParty/opencv2/imgproc/hal/interface.h index ddd7bb5681..29773aa34d 100644 --- a/3rdParty/opencv2/imgproc/hal/interface.h +++ b/3rdParty/opencv2/imgproc/hal/interface.h @@ -12,6 +12,12 @@ #define CV_HAL_INTER_CUBIC 2 #define CV_HAL_INTER_AREA 3 #define CV_HAL_INTER_LANCZOS4 4 +#define CV_HAL_INTER_LINEAR_EXACT 5 +#define CV_HAL_INTER_NEAREST_EXACT 6 +#define CV_HAL_INTER_MAX 7 +#define CV_HAL_WARP_FILL_OUTLIERS 8 +#define CV_HAL_WARP_INVERSE_MAP 16 +#define CV_HAL_WARP_RELATIVE_MAP 32 //! @} //! @name Morphology operations diff --git a/3rdParty/opencv2/imgproc/imgproc_c.h b/3rdParty/opencv2/imgproc/imgproc_c.h index 43918839ae..1dbb6d7059 100644 --- a/3rdParty/opencv2/imgproc/imgproc_c.h +++ b/3rdParty/opencv2/imgproc/imgproc_c.h @@ -209,13 +209,17 @@ CVAPI(void) cvCvtColor( const CvArr* src, CvArr* dst, int code ); CVAPI(void) cvResize( const CvArr* src, CvArr* dst, int interpolation CV_DEFAULT( CV_INTER_LINEAR )); +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable: 5054 ) +#endif /** @brief Warps image with affine transform @note ::cvGetQuadrangleSubPix is similar to ::cvWarpAffine, but the outliers are extrapolated using replication border mode. 
@see cv::warpAffine */ CVAPI(void) cvWarpAffine( const CvArr* src, CvArr* dst, const CvMat* map_matrix, - int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), + int flags CV_DEFAULT(+CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), CvScalar fillval CV_DEFAULT(cvScalarAll(0)) ); /** @brief Computes affine transform matrix for mapping src[i] to dst[i] (i=0,1,2) @@ -235,7 +239,7 @@ CVAPI(CvMat*) cv2DRotationMatrix( CvPoint2D32f center, double angle, @see cv::warpPerspective */ CVAPI(void) cvWarpPerspective( const CvArr* src, CvArr* dst, const CvMat* map_matrix, - int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), + int flags CV_DEFAULT(+CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), CvScalar fillval CV_DEFAULT(cvScalarAll(0)) ); /** @brief Computes perspective transform matrix for mapping src[i] to dst[i] (i=0,1,2,3) @@ -250,7 +254,7 @@ CVAPI(CvMat*) cvGetPerspectiveTransform( const CvPoint2D32f* src, */ CVAPI(void) cvRemap( const CvArr* src, CvArr* dst, const CvArr* mapx, const CvArr* mapy, - int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), + int flags CV_DEFAULT(+CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS), CvScalar fillval CV_DEFAULT(cvScalarAll(0)) ); /** @brief Converts mapx & mapy from floating-point to integer formats for cvRemap @@ -264,14 +268,18 @@ CVAPI(void) cvConvertMaps( const CvArr* mapx, const CvArr* mapy, */ CVAPI(void) cvLogPolar( const CvArr* src, CvArr* dst, CvPoint2D32f center, double M, - int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS)); + int flags CV_DEFAULT(+CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS)); /** Performs forward or inverse linear-polar image transform @see cv::warpPolar */ CVAPI(void) cvLinearPolar( const CvArr* src, CvArr* dst, CvPoint2D32f center, double maxRadius, - int flags CV_DEFAULT(CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS)); + int flags CV_DEFAULT(+CV_INTER_LINEAR+CV_WARP_FILL_OUTLIERS)); + +#ifdef _MSC_VER +#pragma warning( pop ) +#endif /** @brief Returns a structuring element of the specified size and shape for morphological operations. diff --git a/3rdParty/opencv2/imgproc/segmentation.hpp b/3rdParty/opencv2/imgproc/segmentation.hpp index ad62a1b71a..916b0e3bbc 100644 --- a/3rdParty/opencv2/imgproc/segmentation.hpp +++ b/3rdParty/opencv2/imgproc/segmentation.hpp @@ -121,7 +121,7 @@ class CV_EXPORTS_W_SIMPLE IntelligentScissorsMB * * @param targetPt The target point * @param[out] contour The list of pixels which contains optimal path between the source and the target points of the image. Type is CV_32SC2 (compatible with `std::vector`) - * @param backward Flag to indicate reverse order of retrived pixels (use "true" value to fetch points from the target to the source point) + * @param backward Flag to indicate reverse order of retrieved pixels (use "true" value to fetch points from the target to the source point) */ CV_WRAP void getContour(const Point& targetPt, OutputArray contour, bool backward = false) const; diff --git a/3rdParty/opencv2/imgproc/types_c.h b/3rdParty/opencv2/imgproc/types_c.h index 2982885df3..af25ac1c1b 100644 --- a/3rdParty/opencv2/imgproc/types_c.h +++ b/3rdParty/opencv2/imgproc/types_c.h @@ -376,8 +376,9 @@ enum /** ... 
and other image warping flags */ enum { - CV_WARP_FILL_OUTLIERS =8, - CV_WARP_INVERSE_MAP =16 + CV_WARP_FILL_OUTLIERS = 8, + CV_WARP_INVERSE_MAP = 16, + CV_WARP_RELATIVE_MAP = 32 }; /** Shapes of a structuring element for morphological operations diff --git a/3rdParty/opencv2/objdetect.hpp b/3rdParty/opencv2/objdetect.hpp index af43e430ae..15364fc88b 100644 --- a/3rdParty/opencv2/objdetect.hpp +++ b/3rdParty/opencv2/objdetect.hpp @@ -45,6 +45,8 @@ #define OPENCV_OBJDETECT_HPP #include "opencv2/core.hpp" +#include "opencv2/objdetect/aruco_detector.hpp" +#include "opencv2/objdetect/graphical_code_detector.hpp" /** @defgroup objdetect Object Detection @@ -52,59 +54,82 @@ @{ @defgroup objdetect_cascade_classifier Cascade Classifier for Object Detection -The object detector described below has been initially proposed by Paul Viola @cite Viola01 and -improved by Rainer Lienhart @cite Lienhart02 . - -First, a classifier (namely a *cascade of boosted classifiers working with haar-like features*) is -trained with a few hundred sample views of a particular object (i.e., a face or a car), called -positive examples, that are scaled to the same size (say, 20x20), and negative examples - arbitrary -images of the same size. - -After a classifier is trained, it can be applied to a region of interest (of the same size as used -during the training) in an input image. The classifier outputs a "1" if the region is likely to show -the object (i.e., face/car), and "0" otherwise. To search for the object in the whole image one can -move the search window across the image and check every location using the classifier. The -classifier is designed so that it can be easily "resized" in order to be able to find the objects of -interest at different sizes, which is more efficient than resizing the image itself. So, to find an -object of an unknown size in the image the scan procedure should be done several times at different -scales. - -The word "cascade" in the classifier name means that the resultant classifier consists of several -simpler classifiers (*stages*) that are applied subsequently to a region of interest until at some -stage the candidate is rejected or all the stages are passed. The word "boosted" means that the -classifiers at every stage of the cascade are complex themselves and they are built out of basic -classifiers using one of four different boosting techniques (weighted voting). Currently Discrete -Adaboost, Real Adaboost, Gentle Adaboost and Logitboost are supported. The basic classifiers are -decision-tree classifiers with at least 2 leaves. Haar-like features are the input to the basic -classifiers, and are calculated as described below. The current algorithm uses the following -Haar-like features: - -![image](pics/haarfeatures.png) - -The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within -the region of interest and the scale (this scale is not the same as the scale used at the detection -stage, though these two scales are multiplied). For example, in the case of the third line feature -(2c) the response is calculated as the difference between the sum of image pixels under the -rectangle covering the whole feature (including the two white stripes and the black stripe in the -middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to -compensate for the differences in the size of areas. 
The sums of pixel values over a rectangular -regions are calculated rapidly using integral images (see below and the integral description). - -Check @ref tutorial_cascade_classifier "the corresponding tutorial" for more details. - -The following reference is for the detection part only. There is a separate application called -opencv_traincascade that can train a cascade of boosted classifiers from a set of samples. - -@note In the new C++ interface it is also possible to use LBP (local binary pattern) features in -addition to Haar-like features. .. [Viola01] Paul Viola and Michael J. Jones. Rapid Object Detection -using a Boosted Cascade of Simple Features. IEEE CVPR, 2001. The paper is available online at - + The object detector described below has been initially proposed by Paul Viola @cite Viola01 and + improved by Rainer Lienhart @cite Lienhart02 . + + First, a classifier (namely a *cascade of boosted classifiers working with haar-like features*) is + trained with a few hundred sample views of a particular object (i.e., a face or a car), called + positive examples, that are scaled to the same size (say, 20x20), and negative examples - arbitrary + images of the same size. + + After a classifier is trained, it can be applied to a region of interest (of the same size as used + during the training) in an input image. The classifier outputs a "1" if the region is likely to show + the object (i.e., face/car), and "0" otherwise. To search for the object in the whole image one can + move the search window across the image and check every location using the classifier. The + classifier is designed so that it can be easily "resized" in order to be able to find the objects of + interest at different sizes, which is more efficient than resizing the image itself. So, to find an + object of an unknown size in the image the scan procedure should be done several times at different + scales. + + The word "cascade" in the classifier name means that the resultant classifier consists of several + simpler classifiers (*stages*) that are applied subsequently to a region of interest until at some + stage the candidate is rejected or all the stages are passed. The word "boosted" means that the + classifiers at every stage of the cascade are complex themselves and they are built out of basic + classifiers using one of four different boosting techniques (weighted voting). Currently Discrete + Adaboost, Real Adaboost, Gentle Adaboost and Logitboost are supported. The basic classifiers are + decision-tree classifiers with at least 2 leaves. Haar-like features are the input to the basic + classifiers, and are calculated as described below. The current algorithm uses the following + Haar-like features: + + ![image](pics/haarfeatures.png) + + The feature used in a particular classifier is specified by its shape (1a, 2b etc.), position within + the region of interest and the scale (this scale is not the same as the scale used at the detection + stage, though these two scales are multiplied). For example, in the case of the third line feature + (2c) the response is calculated as the difference between the sum of image pixels under the + rectangle covering the whole feature (including the two white stripes and the black stripe in the + middle) and the sum of the image pixels under the black stripe multiplied by 3 in order to + compensate for the differences in the size of areas. The sums of pixel values over a rectangular + regions are calculated rapidly using integral images (see below and the integral description). 
+ + Check @ref tutorial_cascade_classifier "the corresponding tutorial" for more details. + + The following reference is for the detection part only. There is a separate application called + opencv_traincascade that can train a cascade of boosted classifiers from a set of samples. + + @note In the new C++ interface it is also possible to use LBP (local binary pattern) features in + addition to Haar-like features. .. [Viola01] Paul Viola and Michael J. Jones. Rapid Object Detection + using a Boosted Cascade of Simple Features. IEEE CVPR, 2001. The paper is available online at + @defgroup objdetect_hog HOG (Histogram of Oriented Gradients) descriptor and object detector + @defgroup objdetect_barcode Barcode detection and decoding @defgroup objdetect_qrcode QRCode detection and encoding @defgroup objdetect_dnn_face DNN-based face detection and recognition -Check @ref tutorial_dnn_face "the corresponding tutorial" for more details. + + Check @ref tutorial_dnn_face "the corresponding tutorial" for more details. + @defgroup objdetect_common Common functions and classes + @defgroup objdetect_aruco ArUco markers and boards detection for robust camera pose estimation + @{ + ArUco Marker Detection + Square fiducial markers (also known as Augmented Reality Markers) are useful for easy, + fast and robust camera pose estimation. + + The main functionality of ArucoDetector class is detection of markers in an image. If the markers are grouped + as a board, then you can try to recover the missing markers with ArucoDetector::refineDetectedMarkers(). + ArUco markers can also be used for advanced chessboard corner finding. To do this, group the markers in the + CharucoBoard and find the corners of the chessboard with the CharucoDetector::detectBoard(). + + The implementation is based on the ArUco Library by R. Muñoz-Salinas and S. Garrido-Jurado @cite Aruco2014. + + Markers can also be detected based on the AprilTag 2 @cite wang2016iros fiducial detection method. + + @sa @cite Aruco2014 + This code has been originally developed by Sergio Garrido-Jurado as a project + for Google Summer of Code 2015 (GSoC 15). + @} + @} */ @@ -707,19 +732,21 @@ class CV_EXPORTS_W QRCodeEncoder { ECI_UTF8 = 26 }; - /** @brief QR code encoder parameters. - @param version The optional version of QR code (by default - maximum possible depending on - the length of the string). - @param correction_level The optional level of error correction (by default - the lowest). - @param mode The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append. - @param structure_number The optional number of QR codes to generate in Structured Append mode. - */ + /** @brief QR code encoder parameters. */ struct CV_EXPORTS_W_SIMPLE Params { CV_WRAP Params(); + + //! The optional version of QR code (by default - maximum possible depending on the length of the string). CV_PROP_RW int version; + + //! The optional level of error correction (by default - the lowest). CV_PROP_RW CorrectionLevel correction_level; + + //! The optional encoding mode - Numeric, Alphanumeric, Byte, Kanji, ECI or Structured Append. CV_PROP_RW EncodeMode mode; + + //! The optional number of QR codes to generate in Structured Append mode. 
CV_PROP_RW int structure_number; }; @@ -742,38 +769,27 @@ class CV_EXPORTS_W QRCodeEncoder { CV_WRAP virtual void encodeStructuredAppend(const String& encoded_info, OutputArrayOfArrays qrcodes) = 0; }; - -class CV_EXPORTS_W QRCodeDetector +class CV_EXPORTS_W_SIMPLE QRCodeDetector : public GraphicalCodeDetector { public: CV_WRAP QRCodeDetector(); - ~QRCodeDetector(); /** @brief sets the epsilon used during the horizontal scan of QR code stop marker detection. @param epsX Epsilon neighborhood, which allows you to determine the horizontal pattern of the scheme 1:1:3:1:1 according to QR code standard. */ - CV_WRAP void setEpsX(double epsX); + CV_WRAP QRCodeDetector& setEpsX(double epsX); /** @brief sets the epsilon used during the vertical scan of QR code stop marker detection. @param epsY Epsilon neighborhood, which allows you to determine the vertical pattern of the scheme 1:1:3:1:1 according to QR code standard. */ - CV_WRAP void setEpsY(double epsY); + CV_WRAP QRCodeDetector& setEpsY(double epsY); - /** @brief Detects QR code in image and returns the quadrangle containing the code. - @param img grayscale or color (BGR) image containing (or not) QR code. - @param points Output vector of vertices of the minimum-area quadrangle containing the code. + /** @brief use markers to improve the position of the corners of the QR code + * + * alignmentMarkers using by default */ - CV_WRAP bool detect(InputArray img, OutputArray points) const; - - /** @brief Decodes QR code in image once it's found by the detect() method. - - Returns UTF8-encoded output string or empty string if the code cannot be decoded. - @param img grayscale or color (BGR) image containing QR code. - @param points Quadrangle vertices found by detect() method (or some other algorithm). - @param straight_qrcode The optional output image containing rectified and binarized QR code - */ - CV_WRAP std::string decode(InputArray img, InputArray points, OutputArray straight_qrcode = noArray()); + CV_WRAP QRCodeDetector& setUseAlignmentMarkers(bool useAlignmentMarkers); /** @brief Decodes QR code on a curved surface in image once it's found by the detect() method. @@ -784,15 +800,6 @@ class CV_EXPORTS_W QRCodeDetector */ CV_WRAP cv::String decodeCurved(InputArray img, InputArray points, OutputArray straight_qrcode = noArray()); - /** @brief Both detects and decodes QR code - - @param img grayscale or color (BGR) image containing QR code. - @param points optional output array of vertices of the found QR code quadrangle. Will be empty if not found. - @param straight_qrcode The optional output image containing rectified and binarized QR code - */ - CV_WRAP std::string detectAndDecode(InputArray img, OutputArray points=noArray(), - OutputArray straight_qrcode = noArray()); - /** @brief Both detects and decodes QR code on a curved surface @param img grayscale or color (BGR) image containing QR code. @@ -801,43 +808,58 @@ class CV_EXPORTS_W QRCodeDetector */ CV_WRAP std::string detectAndDecodeCurved(InputArray img, OutputArray points=noArray(), OutputArray straight_qrcode = noArray()); +}; - /** @brief Detects QR codes in image and returns the vector of the quadrangles containing the codes. - @param img grayscale or color (BGR) image containing (or not) QR codes. - @param points Output vector of vector of vertices of the minimum-area quadrangle containing the codes. - */ - CV_WRAP - bool detectMulti(InputArray img, OutputArray points) const; - - /** @brief Decodes QR codes in image once it's found by the detect() method. 
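
Because QRCodeDetector now derives from GraphicalCodeDetector, the detect/decode entry points come from the base class, while the class-specific setters return a reference and can be chained. A hedged round-trip sketch (the encoded text and the upscale factor are arbitrary choices):

```cpp
// Encode a string, enlarge the one-pixel-per-module image, then detect and decode it again.
#include <opencv2/objdetect.hpp>
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <string>

int main() {
    // Encode with medium error correction.
    cv::QRCodeEncoder::Params encParams;
    encParams.correction_level = cv::QRCodeEncoder::CORRECT_LEVEL_M;
    cv::Ptr<cv::QRCodeEncoder> encoder = cv::QRCodeEncoder::create(encParams);
    cv::Mat qr, qrBig;
    encoder->encode("https://opencv.org", qr);
    cv::resize(qr, qrBig, cv::Size(), 10, 10, cv::INTER_NEAREST);  // make modules detectable

    // The epsilon setters chain; detectAndDecode() is inherited from GraphicalCodeDetector.
    cv::QRCodeDetector detector;
    detector.setEpsX(0.2).setEpsY(0.1).setUseAlignmentMarkers(true);
    cv::Mat corners;
    std::string text = detector.detectAndDecode(qrBig, corners);
    std::cout << "decoded: " << text << std::endl;
    return 0;
}
```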
- @param img grayscale or color (BGR) image containing QR codes. - @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded. - @param points vector of Quadrangle vertices found by detect() method (or some other algorithm). - @param straight_qrcode The optional output vector of images containing rectified and binarized QR codes - */ - CV_WRAP - bool decodeMulti( - InputArray img, InputArray points, - CV_OUT std::vector& decoded_info, - OutputArrayOfArrays straight_qrcode = noArray() - ) const; - - /** @brief Both detects and decodes QR codes - @param img grayscale or color (BGR) image containing QR codes. - @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded. - @param points optional output vector of vertices of the found QR code quadrangles. Will be empty if not found. - @param straight_qrcode The optional output vector of images containing rectified and binarized QR codes - */ - CV_WRAP - bool detectAndDecodeMulti( - InputArray img, CV_OUT std::vector& decoded_info, - OutputArray points = noArray(), - OutputArrayOfArrays straight_qrcode = noArray() - ) const; +class CV_EXPORTS_W_SIMPLE QRCodeDetectorAruco : public GraphicalCodeDetector { +public: + CV_WRAP QRCodeDetectorAruco(); -protected: - struct Impl; - Ptr p; + struct CV_EXPORTS_W_SIMPLE Params { + CV_WRAP Params(); + + /** @brief The minimum allowed pixel size of a QR module in the smallest image in the image pyramid, default 4.f */ + CV_PROP_RW float minModuleSizeInPyramid; + + /** @brief The maximum allowed relative rotation for finder patterns in the same QR code, default pi/12 */ + CV_PROP_RW float maxRotation; + + /** @brief The maximum allowed relative mismatch in module sizes for finder patterns in the same QR code, default 1.75f */ + CV_PROP_RW float maxModuleSizeMismatch; + + /** @brief The maximum allowed module relative mismatch for timing pattern module, default 2.f + * + * If relative mismatch of timing pattern module more this value, penalty points will be added. + * If a lot of penalty points are added, QR code will be rejected. */ + CV_PROP_RW float maxTimingPatternMismatch; + + /** @brief The maximum allowed percentage of penalty points out of total pins in timing pattern, default 0.4f */ + CV_PROP_RW float maxPenalties; + + /** @brief The maximum allowed relative color mismatch in the timing pattern, default 0.2f*/ + CV_PROP_RW float maxColorsMismatch; + + /** @brief The algorithm find QR codes with almost minimum timing pattern score and minimum size, default 0.9f + * + * The QR code with the minimum "timing pattern score" and minimum "size" is selected as the best QR code. + * If for the current QR code "timing pattern score" * scaleTimingPatternScore < "previous timing pattern score" and "size" < "previous size", then + * current QR code set as the best QR code. */ + CV_PROP_RW float scaleTimingPatternScore; + }; + + /** @brief QR code detector constructor for Aruco-based algorithm. See cv::QRCodeDetectorAruco::Params */ + CV_WRAP explicit QRCodeDetectorAruco(const QRCodeDetectorAruco::Params& params); + + /** @brief Detector parameters getter. See cv::QRCodeDetectorAruco::Params */ + CV_WRAP const QRCodeDetectorAruco::Params& getDetectorParameters() const; + + /** @brief Detector parameters setter. 
See cv::QRCodeDetectorAruco::Params */ + CV_WRAP QRCodeDetectorAruco& setDetectorParameters(const QRCodeDetectorAruco::Params& params); + + /** @brief Aruco detector parameters are used to search for the finder patterns. */ + CV_WRAP const aruco::DetectorParameters& getArucoParameters() const; + + /** @brief Aruco detector parameters are used to search for the finder patterns. */ + CV_WRAP void setArucoParameters(const aruco::DetectorParameters& params); }; //! @} @@ -845,5 +867,7 @@ class CV_EXPORTS_W QRCodeDetector #include "opencv2/objdetect/detection_based_tracker.hpp" #include "opencv2/objdetect/face.hpp" +#include "opencv2/objdetect/charuco_detector.hpp" +#include "opencv2/objdetect/barcode.hpp" #endif diff --git a/3rdParty/opencv2/objdetect/aruco_board.hpp b/3rdParty/opencv2/objdetect/aruco_board.hpp new file mode 100644 index 0000000000..e8300c82bf --- /dev/null +++ b/3rdParty/opencv2/objdetect/aruco_board.hpp @@ -0,0 +1,199 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#ifndef OPENCV_OBJDETECT_ARUCO_BOARD_HPP +#define OPENCV_OBJDETECT_ARUCO_BOARD_HPP + +#include + +namespace cv { +namespace aruco { +//! @addtogroup objdetect_aruco +//! @{ + +class Dictionary; + +/** @brief Board of ArUco markers + * + * A board is a set of markers in the 3D space with a common coordinate system. + * The common form of a board of marker is a planar (2D) board, however any 3D layout can be used. + * A Board object is composed by: + * - The object points of the marker corners, i.e. their coordinates respect to the board system. + * - The dictionary which indicates the type of markers of the board + * - The identifier of all the markers in the board. + */ +class CV_EXPORTS_W_SIMPLE Board { +public: + /** @brief Common Board constructor + * + * @param objPoints array of object points of all the marker corners in the board + * @param dictionary the dictionary of markers employed for this board + * @param ids vector of the identifiers of the markers in the board + */ + CV_WRAP Board(InputArrayOfArrays objPoints, const Dictionary& dictionary, InputArray ids); + + /** @brief return the Dictionary of markers employed for this board + */ + CV_WRAP const Dictionary& getDictionary() const; + + /** @brief return array of object points of all the marker corners in the board. + * + * Each marker include its 4 corners in this order: + * - objPoints[i][0] - left-top point of i-th marker + * - objPoints[i][1] - right-top point of i-th marker + * - objPoints[i][2] - right-bottom point of i-th marker + * - objPoints[i][3] - left-bottom point of i-th marker + * + * Markers are placed in a certain order - row by row, left to right in every row. For M markers, the size is Mx4. + */ + CV_WRAP const std::vector >& getObjPoints() const; + + /** @brief vector of the identifiers of the markers in the board (should be the same size as objPoints) + * @return vector of the identifiers of the markers + */ + CV_WRAP const std::vector& getIds() const; + + /** @brief get coordinate of the bottom right corner of the board, is set when calling the function create() + */ + CV_WRAP const Point3f& getRightBottomCorner() const; + + /** @brief Given a board configuration and a set of detected markers, returns the corresponding + * image points and object points, can be used in solvePnP() + * + * @param detectedCorners List of detected marker corners of the board. 
+ * For cv::Board and cv::GridBoard the method expects std::vector> or std::vector with Aruco marker corners. + * For cv::CharucoBoard the method expects std::vector or Mat with ChAruco corners (chess board corners matched with Aruco markers). + * + * @param detectedIds List of identifiers for each marker or charuco corner. + * For any Board class the method expects std::vector or Mat. + * + * @param objPoints Vector of marker points in the board coordinate space. + * For any Board class the method expects std::vector objectPoints or cv::Mat + * + * @param imgPoints Vector of marker points in the image coordinate space. + * For any Board class the method expects std::vector objectPoints or cv::Mat + * + * @sa solvePnP + */ + CV_WRAP void matchImagePoints(InputArrayOfArrays detectedCorners, InputArray detectedIds, + OutputArray objPoints, OutputArray imgPoints) const; + + /** @brief Draw a planar board + * + * @param outSize size of the output image in pixels. + * @param img output image with the board. The size of this image will be outSize + * and the board will be on the center, keeping the board proportions. + * @param marginSize minimum margins (in pixels) of the board in the output image + * @param borderBits width of the marker borders. + * + * This function return the image of the board, ready to be printed. + */ + CV_WRAP void generateImage(Size outSize, OutputArray img, int marginSize = 0, int borderBits = 1) const; + + CV_DEPRECATED_EXTERNAL // avoid using in C++ code, will be moved to "protected" (need to fix bindings first) + Board(); + + struct Impl; +protected: + Board(const Ptr& impl); + Ptr impl; +}; + +/** @brief Planar board with grid arrangement of markers + * + * More common type of board. All markers are placed in the same plane in a grid arrangement. + * The board image can be drawn using generateImage() method. + */ +class CV_EXPORTS_W_SIMPLE GridBoard : public Board { +public: + /** + * @brief GridBoard constructor + * + * @param size number of markers in x and y directions + * @param markerLength marker side length (normally in meters) + * @param markerSeparation separation between two markers (same unit as markerLength) + * @param dictionary dictionary of markers indicating the type of markers + * @param ids set of marker ids in dictionary to use on board. + */ + CV_WRAP GridBoard(const Size& size, float markerLength, float markerSeparation, + const Dictionary &dictionary, InputArray ids = noArray()); + + CV_WRAP Size getGridSize() const; + CV_WRAP float getMarkerLength() const; + CV_WRAP float getMarkerSeparation() const; + + CV_DEPRECATED_EXTERNAL // avoid using in C++ code, will be moved to "protected" (need to fix bindings first) + GridBoard(); +}; + +/** + * @brief ChArUco board is a planar chessboard where the markers are placed inside the white squares of a chessboard. + * + * The benefits of ChArUco boards is that they provide both, ArUco markers versatility and chessboard corner precision, + * which is important for calibration and pose estimation. The board image can be drawn using generateImage() method. 
+ */ +class CV_EXPORTS_W_SIMPLE CharucoBoard : public Board { +public: + /** @brief CharucoBoard constructor + * + * @param size number of chessboard squares in x and y directions + * @param squareLength squareLength chessboard square side length (normally in meters) + * @param markerLength marker side length (same unit than squareLength) + * @param dictionary dictionary of markers indicating the type of markers + * @param ids array of id used markers + * The first markers in the dictionary are used to fill the white chessboard squares. + */ + CV_WRAP CharucoBoard(const Size& size, float squareLength, float markerLength, + const Dictionary &dictionary, InputArray ids = noArray()); + + /** @brief set legacy chessboard pattern. + * + * Legacy setting creates chessboard patterns starting with a white box in the upper left corner + * if there is an even row count of chessboard boxes, otherwise it starts with a black box. + * This setting ensures compatibility to patterns created with OpenCV versions prior OpenCV 4.6.0. + * See https://github.com/opencv/opencv/issues/23152. + * + * Default value: false. + */ + CV_WRAP void setLegacyPattern(bool legacyPattern); + CV_WRAP bool getLegacyPattern() const; + + CV_WRAP Size getChessboardSize() const; + CV_WRAP float getSquareLength() const; + CV_WRAP float getMarkerLength() const; + + /** @brief get CharucoBoard::chessboardCorners + */ + CV_WRAP std::vector getChessboardCorners() const; + + /** @brief get CharucoBoard::nearestMarkerIdx, for each charuco corner, nearest marker index in ids array + */ + CV_PROP std::vector > getNearestMarkerIdx() const; + + /** @brief get CharucoBoard::nearestMarkerCorners, for each charuco corner, nearest marker corner id of each marker + */ + CV_PROP std::vector > getNearestMarkerCorners() const; + + /** @brief check whether the ChArUco markers are collinear + * + * @param charucoIds list of identifiers for each corner in charucoCorners per frame. + * @return bool value, 1 (true) if detected corners form a line, 0 (false) if they do not. + * solvePnP, calibration functions will fail if the corners are collinear (true). + * + * The number of ids in charucoIDs should be <= the number of chessboard corners in the board. + * This functions checks whether the charuco corners are on a straight line (returns true, if so), or not (false). + * Axis parallel, as well as diagonal and other straight lines detected. Degenerate cases: + * for number of charucoIDs <= 2,the function returns true. + */ + CV_WRAP bool checkCharucoCornersCollinear(InputArray charucoIds) const; + + CV_DEPRECATED_EXTERNAL // avoid using in C++ code, will be moved to "protected" (need to fix bindings first) + CharucoBoard(); +}; + +//! @} + +} +} + +#endif diff --git a/3rdParty/opencv2/objdetect/aruco_detector.hpp b/3rdParty/opencv2/objdetect/aruco_detector.hpp new file mode 100644 index 0000000000..9d30d55d17 --- /dev/null +++ b/3rdParty/opencv2/objdetect/aruco_detector.hpp @@ -0,0 +1,400 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#ifndef OPENCV_OBJDETECT_ARUCO_DETECTOR_HPP +#define OPENCV_OBJDETECT_ARUCO_DETECTOR_HPP + +#include +#include + +namespace cv { +namespace aruco { + +//! @addtogroup objdetect_aruco +//! 
@{ + +enum CornerRefineMethod{ + CORNER_REFINE_NONE, ///< Tag and corners detection based on the ArUco approach + CORNER_REFINE_SUBPIX, ///< ArUco approach and refine the corners locations using corner subpixel accuracy + CORNER_REFINE_CONTOUR, ///< ArUco approach and refine the corners locations using the contour-points line fitting + CORNER_REFINE_APRILTAG, ///< Tag and corners detection based on the AprilTag 2 approach @cite wang2016iros +}; + +/** @brief struct DetectorParameters is used by ArucoDetector + */ +struct CV_EXPORTS_W_SIMPLE DetectorParameters { + CV_WRAP DetectorParameters() { + adaptiveThreshWinSizeMin = 3; + adaptiveThreshWinSizeMax = 23; + adaptiveThreshWinSizeStep = 10; + adaptiveThreshConstant = 7; + minMarkerPerimeterRate = 0.03; + maxMarkerPerimeterRate = 4.; + polygonalApproxAccuracyRate = 0.03; + minCornerDistanceRate = 0.05; + minDistanceToBorder = 3; + minMarkerDistanceRate = 0.125; + cornerRefinementMethod = (int)CORNER_REFINE_NONE; + cornerRefinementWinSize = 5; + relativeCornerRefinmentWinSize = 0.3f; + cornerRefinementMaxIterations = 30; + cornerRefinementMinAccuracy = 0.1; + markerBorderBits = 1; + perspectiveRemovePixelPerCell = 4; + perspectiveRemoveIgnoredMarginPerCell = 0.13; + maxErroneousBitsInBorderRate = 0.35; + minOtsuStdDev = 5.0; + errorCorrectionRate = 0.6; + aprilTagQuadDecimate = 0.0; + aprilTagQuadSigma = 0.0; + aprilTagMinClusterPixels = 5; + aprilTagMaxNmaxima = 10; + aprilTagCriticalRad = (float)(10* CV_PI /180); + aprilTagMaxLineFitMse = 10.0; + aprilTagMinWhiteBlackDiff = 5; + aprilTagDeglitch = 0; + detectInvertedMarker = false; + useAruco3Detection = false; + minSideLengthCanonicalImg = 32; + minMarkerLengthRatioOriginalImg = 0.0; + } + + /** @brief Read a new set of DetectorParameters from FileNode (use FileStorage.root()). + */ + CV_WRAP bool readDetectorParameters(const FileNode& fn); + + /** @brief Write a set of DetectorParameters to FileStorage + */ + CV_WRAP bool writeDetectorParameters(FileStorage& fs, const String& name = String()); + + /// minimum window size for adaptive thresholding before finding contours (default 3). + CV_PROP_RW int adaptiveThreshWinSizeMin; + + /// maximum window size for adaptive thresholding before finding contours (default 23). + CV_PROP_RW int adaptiveThreshWinSizeMax; + + /// increments from adaptiveThreshWinSizeMin to adaptiveThreshWinSizeMax during the thresholding (default 10). + CV_PROP_RW int adaptiveThreshWinSizeStep; + + /// constant for adaptive thresholding before finding contours (default 7) + CV_PROP_RW double adaptiveThreshConstant; + + /** @brief determine minimum perimeter for marker contour to be detected. + * + * This is defined as a rate respect to the maximum dimension of the input image (default 0.03). + */ + CV_PROP_RW double minMarkerPerimeterRate; + + /** @brief determine maximum perimeter for marker contour to be detected. + * + * This is defined as a rate respect to the maximum dimension of the input image (default 4.0). + */ + CV_PROP_RW double maxMarkerPerimeterRate; + + /// minimum accuracy during the polygonal approximation process to determine which contours are squares. 
(default 0.03) + CV_PROP_RW double polygonalApproxAccuracyRate; + + /// minimum distance between corners for detected markers relative to its perimeter (default 0.05) + CV_PROP_RW double minCornerDistanceRate; + + /// minimum distance of any corner to the image border for detected markers (in pixels) (default 3) + CV_PROP_RW int minDistanceToBorder; + + /** @brief minimum average distance between the corners of the two markers to be grouped (default 0.125). + * + * The rate is relative to the smaller perimeter of the two markers. + * Two markers are grouped if average distance between the corners of the two markers is less than + * min(MarkerPerimeter1, MarkerPerimeter2)*minMarkerDistanceRate. + * + * default value is 0.125 because 0.125*MarkerPerimeter = (MarkerPerimeter / 4) * 0.5 = half the side of the marker. + * + * @note default value was changed from 0.05 after 4.8.1 release, because the filtering algorithm has been changed. + * Now a few candidates from the same group can be added to the list of candidates if they are far from each other. + * @sa minGroupDistance. + */ + CV_PROP_RW double minMarkerDistanceRate; + + /** @brief minimum average distance between the corners of the two markers in group to add them to the list of candidates + * + * The average distance between the corners of the two markers is calculated relative to its module size (default 0.21). + */ + CV_PROP_RW float minGroupDistance = 0.21f; + + /** @brief default value CORNER_REFINE_NONE */ + CV_PROP_RW int cornerRefinementMethod; + + /** @brief maximum window size for the corner refinement process (in pixels) (default 5). + * + * The window size may decrease if the ArUco marker is too small, check relativeCornerRefinmentWinSize. + * The final window size is calculated as: + * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize), + * where averageArucoModuleSize is average module size of ArUco marker in pixels. + * (ArUco marker is composed of black and white modules) + */ + CV_PROP_RW int cornerRefinementWinSize; + + /** @brief Dynamic window size for corner refinement relative to Aruco module size (default 0.3). + * + * The final window size is calculated as: + * min(cornerRefinementWinSize, averageArucoModuleSize*relativeCornerRefinmentWinSize), + * where averageArucoModuleSize is average module size of ArUco marker in pixels. + * (ArUco marker is composed of black and white modules) + * In the case of markers located far from each other, it may be useful to increase the value of the parameter to 0.4-0.5. + * In the case of markers located close to each other, it may be useful to decrease the parameter value to 0.1-0.2. + */ + CV_PROP_RW float relativeCornerRefinmentWinSize; + + /// maximum number of iterations for stop criteria of the corner refinement process (default 30). + CV_PROP_RW int cornerRefinementMaxIterations; + + /// minimum error for the stop cristeria of the corner refinement process (default: 0.1) + CV_PROP_RW double cornerRefinementMinAccuracy; + + /// number of bits of the marker border, i.e. marker border width (default 1). + CV_PROP_RW int markerBorderBits; + + /// number of bits (per dimension) for each cell of the marker when removing the perspective (default 4). + CV_PROP_RW int perspectiveRemovePixelPerCell; + + /** @brief width of the margin of pixels on each cell not considered for the determination of the cell bit. + * + * Represents the rate respect to the total size of the cell, i.e. 
perspectiveRemovePixelPerCell (default 0.13) + */ + CV_PROP_RW double perspectiveRemoveIgnoredMarginPerCell; + + /** @brief maximum number of accepted erroneous bits in the border (i.e. number of allowed white bits in the border). + * + * Represented as a rate respect to the total number of bits per marker (default 0.35). + */ + CV_PROP_RW double maxErroneousBitsInBorderRate; + + /** @brief minimun standard deviation in pixels values during the decodification step to apply Otsu + * thresholding (otherwise, all the bits are set to 0 or 1 depending on mean higher than 128 or not) (default 5.0) + */ + CV_PROP_RW double minOtsuStdDev; + + /// error correction rate respect to the maximun error correction capability for each dictionary (default 0.6). + CV_PROP_RW double errorCorrectionRate; + + /** @brief April :: User-configurable parameters. + * + * Detection of quads can be done on a lower-resolution image, improving speed at a cost of + * pose accuracy and a slight decrease in detection rate. Decoding the binary payload is still + */ + CV_PROP_RW float aprilTagQuadDecimate; + + /// what Gaussian blur should be applied to the segmented image (used for quad detection?) + CV_PROP_RW float aprilTagQuadSigma; + + // April :: Internal variables + /// reject quads containing too few pixels (default 5). + CV_PROP_RW int aprilTagMinClusterPixels; + + /// how many corner candidates to consider when segmenting a group of pixels into a quad (default 10). + CV_PROP_RW int aprilTagMaxNmaxima; + + /** @brief reject quads where pairs of edges have angles that are close to straight or close to 180 degrees. + * + * Zero means that no quads are rejected. (In radians) (default 10*PI/180) + */ + CV_PROP_RW float aprilTagCriticalRad; + + /// when fitting lines to the contours, what is the maximum mean squared error + CV_PROP_RW float aprilTagMaxLineFitMse; + + /** @brief add an extra check that the white model must be (overall) brighter than the black model. + * + * When we build our model of black & white pixels, we add an extra check that the white model must be (overall) + * brighter than the black model. How much brighter? (in pixel values, [0,255]), (default 5) + */ + CV_PROP_RW int aprilTagMinWhiteBlackDiff; + + /// should the thresholded image be deglitched? Only useful for very noisy images (default 0). + CV_PROP_RW int aprilTagDeglitch; + + /** @brief to check if there is a white marker. + * + * In order to generate a "white" marker just invert a normal marker by using a tilde, ~markerImage. (default false) + */ + CV_PROP_RW bool detectInvertedMarker; + + /** @brief enable the new and faster Aruco detection strategy. + * + * Proposed in the paper: + * Romero-Ramirez et al: Speeded up detection of squared fiducial markers (2018) + * https://www.researchgate.net/publication/325787310_Speeded_Up_Detection_of_Squared_Fiducial_Markers + */ + CV_PROP_RW bool useAruco3Detection; + + /// minimum side length of a marker in the canonical image. Latter is the binarized image in which contours are searched. + CV_PROP_RW int minSideLengthCanonicalImg; + + /// range [0,1], eq (2) from paper. The parameter tau_i has a direct influence on the processing speed. 
+ CV_PROP_RW float minMarkerLengthRatioOriginalImg; +}; + +/** @brief struct RefineParameters is used by ArucoDetector + */ +struct CV_EXPORTS_W_SIMPLE RefineParameters { + CV_WRAP RefineParameters(float minRepDistance = 10.f, float errorCorrectionRate = 3.f, bool checkAllOrders = true); + + + /** @brief Read a new set of RefineParameters from FileNode (use FileStorage.root()). + */ + CV_WRAP bool readRefineParameters(const FileNode& fn); + + /** @brief Write a set of RefineParameters to FileStorage + */ + CV_WRAP bool writeRefineParameters(FileStorage& fs, const String& name = String()); + + /** @brief minRepDistance minimum distance between the corners of the rejected candidate and the reprojected marker + in order to consider it as a correspondence. + */ + CV_PROP_RW float minRepDistance; + + /** @brief errorCorrectionRate rate of allowed erroneous bits respect to the error correction capability of the used dictionary. + * + * -1 ignores the error correction step. + */ + CV_PROP_RW float errorCorrectionRate; + + /** @brief checkAllOrders consider the four posible corner orders in the rejectedCorners array. + * + * If it set to false, only the provided corner order is considered (default true). + */ + CV_PROP_RW bool checkAllOrders; +}; + +/** @brief The main functionality of ArucoDetector class is detection of markers in an image with detectMarkers() method. + * + * After detecting some markers in the image, you can try to find undetected markers from this dictionary with + * refineDetectedMarkers() method. + * + * @see DetectorParameters, RefineParameters + */ +class CV_EXPORTS_W ArucoDetector : public Algorithm +{ +public: + /** @brief Basic ArucoDetector constructor + * + * @param dictionary indicates the type of markers that will be searched + * @param detectorParams marker detection parameters + * @param refineParams marker refine detection parameters + */ + CV_WRAP ArucoDetector(const Dictionary &dictionary = getPredefinedDictionary(cv::aruco::DICT_4X4_50), + const DetectorParameters &detectorParams = DetectorParameters(), + const RefineParameters& refineParams = RefineParameters()); + + /** @brief Basic marker detection + * + * @param image input image + * @param corners vector of detected marker corners. For each marker, its four corners + * are provided, (e.g std::vector > ). For N detected markers, + * the dimensions of this array is Nx4. The order of the corners is clockwise. + * @param ids vector of identifiers of the detected markers. The identifier is of type int + * (e.g. std::vector). For N detected markers, the size of ids is also N. + * The identifiers have the same order than the markers in the imgPoints array. + * @param rejectedImgPoints contains the imgPoints of those squares whose inner code has not a + * correct codification. Useful for debugging purposes. + * + * Performs marker detection in the input image. Only markers included in the specific dictionary + * are searched. For each detected marker, it returns the 2D position of its corner in the image + * and its corresponding identifier. + * Note that this function does not perform pose estimation. + * @note The function does not correct lens distortion or takes it into account. 
It's recommended to undistort + * input image with corresponding camera model, if camera parameters are known + * @sa undistort, estimatePoseSingleMarkers, estimatePoseBoard + */ + CV_WRAP void detectMarkers(InputArray image, OutputArrayOfArrays corners, OutputArray ids, + OutputArrayOfArrays rejectedImgPoints = noArray()) const; + + /** @brief Refine not detected markers based on the already detected and the board layout + * + * @param image input image + * @param board layout of markers in the board. + * @param detectedCorners vector of already detected marker corners. + * @param detectedIds vector of already detected marker identifiers. + * @param rejectedCorners vector of rejected candidates during the marker detection process. + * @param cameraMatrix optional input 3x3 floating-point camera matrix + * \f$A = \vecthreethree{f_x}{0}{c_x}{0}{f_y}{c_y}{0}{0}{1}\f$ + * @param distCoeffs optional vector of distortion coefficients + * \f$(k_1, k_2, p_1, p_2[, k_3[, k_4, k_5, k_6],[s_1, s_2, s_3, s_4]])\f$ of 4, 5, 8 or 12 elements + * @param recoveredIdxs Optional array to returns the indexes of the recovered candidates in the + * original rejectedCorners array. + * + * This function tries to find markers that were not detected in the basic detecMarkers function. + * First, based on the current detected marker and the board layout, the function interpolates + * the position of the missing markers. Then it tries to find correspondence between the reprojected + * markers and the rejected candidates based on the minRepDistance and errorCorrectionRate parameters. + * If camera parameters and distortion coefficients are provided, missing markers are reprojected + * using projectPoint function. If not, missing marker projections are interpolated using global + * homography, and all the marker corners in the board must have the same Z coordinate. + */ + CV_WRAP void refineDetectedMarkers(InputArray image, const Board &board, + InputOutputArrayOfArrays detectedCorners, + InputOutputArray detectedIds, InputOutputArrayOfArrays rejectedCorners, + InputArray cameraMatrix = noArray(), InputArray distCoeffs = noArray(), + OutputArray recoveredIdxs = noArray()) const; + + CV_WRAP const Dictionary& getDictionary() const; + CV_WRAP void setDictionary(const Dictionary& dictionary); + + CV_WRAP const DetectorParameters& getDetectorParameters() const; + CV_WRAP void setDetectorParameters(const DetectorParameters& detectorParameters); + + CV_WRAP const RefineParameters& getRefineParameters() const; + CV_WRAP void setRefineParameters(const RefineParameters& refineParameters); + + /** @brief Stores algorithm parameters in a file storage + */ + virtual void write(FileStorage& fs) const override; + + /** @brief simplified API for language bindings + */ + CV_WRAP inline void write(FileStorage& fs, const String& name) { Algorithm::write(fs, name); } + + /** @brief Reads algorithm parameters from a file storage + */ + CV_WRAP virtual void read(const FileNode& fn) override; +protected: + struct ArucoDetectorImpl; + Ptr arucoDetectorImpl; +}; + +/** @brief Draw detected markers in image + * + * @param image input/output image. It must have 1 or 3 channels. The number of channels is not altered. + * @param corners positions of marker corners on input image. + * (e.g std::vector > ). For N detected markers, the dimensions of + * this array should be Nx4. The order of the corners should be clockwise. + * @param ids vector of identifiers for markers in markersCorners . + * Optional, if not provided, ids are not painted. 
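
A hedged end-to-end sketch of the flow these declarations support: detect markers, visualize them, then match the detections against a board and estimate its pose. The image path, board geometry, and camera intrinsics are illustrative placeholders; solvePnP comes from calib3d.

```cpp
#include <opencv2/objdetect.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main() {
    cv::Mat image = cv::imread("scene.jpg");

    cv::aruco::DetectorParameters detParams;
    detParams.cornerRefinementMethod = (int)cv::aruco::CORNER_REFINE_SUBPIX;
    detParams.useAruco3Detection = true;   // faster Aruco3 strategy described above

    cv::aruco::Dictionary dict = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_6X6_250);
    cv::aruco::ArucoDetector detector(dict, detParams);

    std::vector<std::vector<cv::Point2f>> corners, rejected;
    std::vector<int> ids;
    detector.detectMarkers(image, corners, ids, rejected);
    cv::aruco::drawDetectedMarkers(image, corners, ids);

    // Optional board pose: match against a 5x7 grid board, then run solvePnP.
    if (!ids.empty()) {
        cv::aruco::GridBoard board(cv::Size(5, 7), 0.04f, 0.01f, dict);
        cv::Mat objPoints, imgPoints, rvec, tvec;
        board.matchImagePoints(corners, ids, objPoints, imgPoints);
        cv::Mat cameraMatrix = cv::Mat::eye(3, 3, CV_64F);  // replace with calibrated intrinsics
        if (imgPoints.total() >= 4)
            cv::solvePnP(objPoints, imgPoints, cameraMatrix, cv::noArray(), rvec, tvec);
    }
    return 0;
}
```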
+ * @param borderColor color of marker borders. Rest of colors (text color and first corner color) + * are calculated based on this one to improve visualization. + * + * Given an array of detected marker corners and its corresponding ids, this functions draws + * the markers in the image. The marker borders are painted and the markers identifiers if provided. + * Useful for debugging purposes. + */ +CV_EXPORTS_W void drawDetectedMarkers(InputOutputArray image, InputArrayOfArrays corners, + InputArray ids = noArray(), Scalar borderColor = Scalar(0, 255, 0)); + +/** @brief Generate a canonical marker image + * + * @param dictionary dictionary of markers indicating the type of markers + * @param id identifier of the marker that will be returned. It has to be a valid id in the specified dictionary. + * @param sidePixels size of the image in pixels + * @param img output image with the marker + * @param borderBits width of the marker border. + * + * This function returns a marker image in its canonical form (i.e. ready to be printed) + */ +CV_EXPORTS_W void generateImageMarker(const Dictionary &dictionary, int id, int sidePixels, OutputArray img, + int borderBits = 1); + +//! @} + +} +} + +#endif diff --git a/3rdParty/opencv2/objdetect/aruco_dictionary.hpp b/3rdParty/opencv2/objdetect/aruco_dictionary.hpp new file mode 100644 index 0000000000..bc7b934b2a --- /dev/null +++ b/3rdParty/opencv2/objdetect/aruco_dictionary.hpp @@ -0,0 +1,155 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#ifndef OPENCV_OBJDETECT_DICTIONARY_HPP +#define OPENCV_OBJDETECT_DICTIONARY_HPP + +#include + +namespace cv { +namespace aruco { + +//! @addtogroup objdetect_aruco +//! @{ + + +/** @brief Dictionary is a set of unique ArUco markers of the same size + * + * `bytesList` storing as 2-dimensions Mat with 4-th channels (CV_8UC4 type was used) and contains the marker codewords where: + * - bytesList.rows is the dictionary size + * - each marker is encoded using `nbytes = ceil(markerSize*markerSize/8.)` bytes + * - each row contains all 4 rotations of the marker, so its length is `4*nbytes` + * - the byte order in the bytesList[i] row: + * `//bytes without rotation/bytes with rotation 1/bytes with rotation 2/bytes with rotation 3//` + * So `bytesList.ptr(i)[k*nbytes + j]` is the j-th byte of i-th marker, in its k-th rotation. + * @note Python bindings generate matrix with shape of bytesList `dictionary_size x nbytes x 4`, + * but it should be indexed like C++ version. Python example for j-th byte of i-th marker, in its k-th rotation: + * `aruco_dict.bytesList[id].ravel()[k*nbytes + j]` + */ +class CV_EXPORTS_W_SIMPLE Dictionary { + + public: + CV_PROP_RW Mat bytesList; ///< marker code information. See class description for more details + CV_PROP_RW int markerSize; ///< number of bits per dimension + CV_PROP_RW int maxCorrectionBits; ///< maximum number of bits that can be corrected + + CV_WRAP Dictionary(); + + /** @brief Basic ArUco dictionary constructor + * + * @param bytesList bits for all ArUco markers in dictionary see memory layout in the class description + * @param _markerSize ArUco marker size in units + * @param maxcorr maximum number of bits that can be corrected + */ + CV_WRAP Dictionary(const Mat &bytesList, int _markerSize, int maxcorr = 0); + + /** @brief Read a new dictionary from FileNode. 
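
Rendering a printable marker from one of the predefined dictionaries is a single call; a small sketch (the dictionary choice, marker id, and pixel size are arbitrary):

```cpp
// Draw marker id 23 of the 6x6/250 dictionary at 200x200 px with a 1-bit border.
#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>

int main() {
    cv::aruco::Dictionary dict = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_6X6_250);
    cv::Mat marker;
    cv::aruco::generateImageMarker(dict, 23, 200, marker, 1 /*borderBits*/);
    cv::imwrite("marker23.png", marker);
    return 0;
}
```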
+ * + * Dictionary example in YAML format:\n + * nmarkers: 35\n + * markersize: 6\n + * maxCorrectionBits: 5\n + * marker_0: "101011111011111001001001101100000000"\n + * ...\n + * marker_34: "011111010000111011111110110101100101" + */ + CV_WRAP bool readDictionary(const cv::FileNode& fn); + + /** @brief Write a dictionary to FileStorage, format is the same as in readDictionary(). + */ + CV_WRAP void writeDictionary(FileStorage& fs, const String& name = String()); + + /** @brief Given a matrix of bits. Returns whether if marker is identified or not. + * + * Returns reference to the marker id in the dictionary (if any) and its rotation. + */ + CV_WRAP bool identify(const Mat &onlyBits, CV_OUT int &idx, CV_OUT int &rotation, double maxCorrectionRate) const; + + /** @brief Returns Hamming distance of the input bits to the specific id. + * + * If `allRotations` flag is set, the four posible marker rotations are considered + */ + CV_WRAP int getDistanceToId(InputArray bits, int id, bool allRotations = true) const; + + + /** @brief Generate a canonical marker image + */ + CV_WRAP void generateImageMarker(int id, int sidePixels, OutputArray _img, int borderBits = 1) const; + + + /** @brief Transform matrix of bits to list of bytes with 4 marker rotations + */ + CV_WRAP static Mat getByteListFromBits(const Mat &bits); + + + /** @brief Transform list of bytes to matrix of bits + */ + CV_WRAP static Mat getBitsFromByteList(const Mat &byteList, int markerSize); +}; + + + + +/** @brief Predefined markers dictionaries/sets + * + * Each dictionary indicates the number of bits and the number of markers contained + * - DICT_ARUCO_ORIGINAL: standard ArUco Library Markers. 1024 markers, 5x5 bits, 0 minimum + distance + */ +enum PredefinedDictionaryType { + DICT_4X4_50 = 0, ///< 4x4 bits, minimum hamming distance between any two codes = 4, 50 codes + DICT_4X4_100, ///< 4x4 bits, minimum hamming distance between any two codes = 3, 100 codes + DICT_4X4_250, ///< 4x4 bits, minimum hamming distance between any two codes = 3, 250 codes + DICT_4X4_1000, ///< 4x4 bits, minimum hamming distance between any two codes = 2, 1000 codes + DICT_5X5_50, ///< 5x5 bits, minimum hamming distance between any two codes = 8, 50 codes + DICT_5X5_100, ///< 5x5 bits, minimum hamming distance between any two codes = 7, 100 codes + DICT_5X5_250, ///< 5x5 bits, minimum hamming distance between any two codes = 6, 250 codes + DICT_5X5_1000, ///< 5x5 bits, minimum hamming distance between any two codes = 5, 1000 codes + DICT_6X6_50, ///< 6x6 bits, minimum hamming distance between any two codes = 13, 50 codes + DICT_6X6_100, ///< 6x6 bits, minimum hamming distance between any two codes = 12, 100 codes + DICT_6X6_250, ///< 6x6 bits, minimum hamming distance between any two codes = 11, 250 codes + DICT_6X6_1000, ///< 6x6 bits, minimum hamming distance between any two codes = 9, 1000 codes + DICT_7X7_50, ///< 7x7 bits, minimum hamming distance between any two codes = 19, 50 codes + DICT_7X7_100, ///< 7x7 bits, minimum hamming distance between any two codes = 18, 100 codes + DICT_7X7_250, ///< 7x7 bits, minimum hamming distance between any two codes = 17, 250 codes + DICT_7X7_1000, ///< 7x7 bits, minimum hamming distance between any two codes = 14, 1000 codes + DICT_ARUCO_ORIGINAL, ///< 6x6 bits, minimum hamming distance between any two codes = 3, 1024 codes + DICT_APRILTAG_16h5, ///< 4x4 bits, minimum hamming distance between any two codes = 5, 30 codes + DICT_APRILTAG_25h9, ///< 5x5 bits, minimum hamming distance between any two codes = 9, 
35 codes + DICT_APRILTAG_36h10, ///< 6x6 bits, minimum hamming distance between any two codes = 10, 2320 codes + DICT_APRILTAG_36h11, ///< 6x6 bits, minimum hamming distance between any two codes = 11, 587 codes + DICT_ARUCO_MIP_36h12 ///< 6x6 bits, minimum hamming distance between any two codes = 12, 250 codes +}; + + +/** @brief Returns one of the predefined dictionaries defined in PredefinedDictionaryType + */ +CV_EXPORTS Dictionary getPredefinedDictionary(PredefinedDictionaryType name); + + +/** @brief Returns one of the predefined dictionaries referenced by DICT_*. + */ +CV_EXPORTS_W Dictionary getPredefinedDictionary(int dict); + +/** @brief Extend base dictionary by new nMarkers + * + * @param nMarkers number of markers in the dictionary + * @param markerSize number of bits per dimension of each markers + * @param baseDictionary Include the markers in this dictionary at the beginning (optional) + * @param randomSeed a user supplied seed for theRNG() + * + * This function creates a new dictionary composed by nMarkers markers and each markers composed + * by markerSize x markerSize bits. If baseDictionary is provided, its markers are directly + * included and the rest are generated based on them. If the size of baseDictionary is higher + * than nMarkers, only the first nMarkers in baseDictionary are taken and no new marker is added. + */ +CV_EXPORTS_W Dictionary extendDictionary(int nMarkers, int markerSize, const Dictionary &baseDictionary = Dictionary(), + int randomSeed=0); + + + +//! @} +} +} + +#endif diff --git a/3rdParty/opencv2/objdetect/barcode.hpp b/3rdParty/opencv2/objdetect/barcode.hpp new file mode 100644 index 0000000000..c20b67c0b2 --- /dev/null +++ b/3rdParty/opencv2/objdetect/barcode.hpp @@ -0,0 +1,111 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// Copyright (c) 2020-2021 darkliang wangberlinT Certseeds + +#ifndef OPENCV_OBJDETECT_BARCODE_HPP +#define OPENCV_OBJDETECT_BARCODE_HPP + +#include +#include + +namespace cv { +namespace barcode { + +//! @addtogroup objdetect_barcode +//! @{ + +class CV_EXPORTS_W_SIMPLE BarcodeDetector : public cv::GraphicalCodeDetector +{ +public: + /** @brief Initialize the BarcodeDetector. + */ + CV_WRAP BarcodeDetector(); + /** @brief Initialize the BarcodeDetector. + * + * Parameters allow to load _optional_ Super Resolution DNN model for better quality. + * @param prototxt_path prototxt file path for the super resolution model + * @param model_path model file path for the super resolution model + */ + CV_WRAP BarcodeDetector(CV_WRAP_FILE_PATH const std::string &prototxt_path, CV_WRAP_FILE_PATH const std::string &model_path); + ~BarcodeDetector(); + + /** @brief Decodes barcode in image once it's found by the detect() method. + * + * @param img grayscale or color (BGR) image containing bar code. + * @param points vector of rotated rectangle vertices found by detect() method (or some other algorithm). + * For N detected barcodes, the dimensions of this array should be [N][4]. + * Order of four points in vector is bottomLeft, topLeft, topRight, bottomRight. + * @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded. 
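
A hedged usage sketch for BarcodeDetector (the image path is a placeholder and the optional super-resolution model is omitted):

```cpp
#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>
#include <string>
#include <vector>

int main() {
    cv::Mat img = cv::imread("shelf.jpg");

    cv::barcode::BarcodeDetector bd;
    bd.setDownsamplingThreshold(1024.0);   // allow larger inputs before downsampling

    std::vector<std::string> info, types;
    cv::Mat points;
    if (bd.detectAndDecodeWithType(img, info, types, points))
        for (size_t i = 0; i < info.size(); ++i)
            std::cout << types[i] << ": " << info[i] << "\n";
    return 0;
}
```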
+ * @param decoded_type vector strings, specifies the type of these barcodes + * @return true if at least one valid barcode have been found + */ + CV_WRAP bool decodeWithType(InputArray img, + InputArray points, + CV_OUT std::vector &decoded_info, + CV_OUT std::vector &decoded_type) const; + + /** @brief Both detects and decodes barcode + + * @param img grayscale or color (BGR) image containing barcode. + * @param decoded_info UTF8-encoded output vector of string(s) or empty vector of string if the codes cannot be decoded. + * @param decoded_type vector of strings, specifies the type of these barcodes + * @param points optional output vector of vertices of the found barcode rectangle. Will be empty if not found. + * @return true if at least one valid barcode have been found + */ + CV_WRAP bool detectAndDecodeWithType(InputArray img, + CV_OUT std::vector &decoded_info, + CV_OUT std::vector &decoded_type, + OutputArray points = noArray()) const; + + /** @brief Get detector downsampling threshold. + * + * @return detector downsampling threshold + */ + CV_WRAP double getDownsamplingThreshold() const; + + /** @brief Set detector downsampling threshold. + * + * By default, the detect method resizes the input image to this limit if the smallest image size is is greater than the threshold. + * Increasing this value can improve detection accuracy and the number of results at the expense of performance. + * Correlates with detector scales. Setting this to a large value will disable downsampling. + * @param thresh downsampling limit to apply (default 512) + * @see setDetectorScales + */ + CV_WRAP BarcodeDetector& setDownsamplingThreshold(double thresh); + + /** @brief Returns detector box filter sizes. + * + * @param sizes output parameter for returning the sizes. + */ + CV_WRAP void getDetectorScales(CV_OUT std::vector& sizes) const; + + /** @brief Set detector box filter sizes. + * + * Adjusts the value and the number of box filters used in the detect step. + * The filter sizes directly correlate with the expected line widths for a barcode. Corresponds to expected barcode distance. + * If the downsampling limit is increased, filter sizes need to be adjusted in an inversely proportional way. + * @param sizes box filter sizes, relative to minimum dimension of the image (default [0.01, 0.03, 0.06, 0.08]) + */ + CV_WRAP BarcodeDetector& setDetectorScales(const std::vector& sizes); + + /** @brief Get detector gradient magnitude threshold. + * + * @return detector gradient magnitude threshold. + */ + CV_WRAP double getGradientThreshold() const; + + /** @brief Set detector gradient magnitude threshold. + * + * Sets the coherence threshold for detected bounding boxes. + * Increasing this value will generate a closer fitted bounding box width and can reduce false-positives. + * Values between 16 and 1024 generally work, while too high of a value will remove valid detections. + * @param thresh gradient magnitude threshold (default 64). + */ + CV_WRAP BarcodeDetector& setGradientThreshold(double thresh); +}; +//! @} + +}} // cv::barcode:: + +#endif // OPENCV_OBJDETECT_BARCODE_HPP diff --git a/3rdParty/opencv2/objdetect/charuco_detector.hpp b/3rdParty/opencv2/objdetect/charuco_detector.hpp new file mode 100644 index 0000000000..e10cb3f025 --- /dev/null +++ b/3rdParty/opencv2/objdetect/charuco_detector.hpp @@ -0,0 +1,157 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#ifndef OPENCV_OBJDETECT_CHARUCO_DETECTOR_HPP +#define OPENCV_OBJDETECT_CHARUCO_DETECTOR_HPP + +#include "opencv2/objdetect/aruco_detector.hpp" + +namespace cv { +namespace aruco { + +//! @addtogroup objdetect_aruco +//! @{ + +struct CV_EXPORTS_W_SIMPLE CharucoParameters { + CV_WRAP CharucoParameters() { + minMarkers = 2; + tryRefineMarkers = false; + } + /// cameraMatrix optional 3x3 floating-point camera matrix + CV_PROP_RW Mat cameraMatrix; + + /// distCoeffs optional vector of distortion coefficients + CV_PROP_RW Mat distCoeffs; + + /// minMarkers number of adjacent markers that must be detected to return a charuco corner, default = 2 + CV_PROP_RW int minMarkers; + + /// try to use refine board, default false + CV_PROP_RW bool tryRefineMarkers; +}; + +class CV_EXPORTS_W CharucoDetector : public Algorithm { +public: + /** @brief Basic CharucoDetector constructor + * + * @param board ChAruco board + * @param charucoParams charuco detection parameters + * @param detectorParams marker detection parameters + * @param refineParams marker refine detection parameters + */ + CV_WRAP CharucoDetector(const CharucoBoard& board, + const CharucoParameters& charucoParams = CharucoParameters(), + const DetectorParameters &detectorParams = DetectorParameters(), + const RefineParameters& refineParams = RefineParameters()); + + CV_WRAP const CharucoBoard& getBoard() const; + CV_WRAP void setBoard(const CharucoBoard& board); + + CV_WRAP const CharucoParameters& getCharucoParameters() const; + CV_WRAP void setCharucoParameters(CharucoParameters& charucoParameters); + + CV_WRAP const DetectorParameters& getDetectorParameters() const; + CV_WRAP void setDetectorParameters(const DetectorParameters& detectorParameters); + + CV_WRAP const RefineParameters& getRefineParameters() const; + CV_WRAP void setRefineParameters(const RefineParameters& refineParameters); + + /** + * @brief detect aruco markers and interpolate position of ChArUco board corners + * @param image input image necesary for corner refinement. Note that markers are not detected and + * should be sent in corners and ids parameters. + * @param charucoCorners interpolated chessboard corners. + * @param charucoIds interpolated chessboard corners identifiers. + * @param markerCorners vector of already detected markers corners. For each marker, its four + * corners are provided, (e.g std::vector > ). For N detected markers, the + * dimensions of this array should be Nx4. The order of the corners should be clockwise. + * If markerCorners and markerCorners are empty, the function detect aruco markers and ids. + * @param markerIds list of identifiers for each marker in corners. + * If markerCorners and markerCorners are empty, the function detect aruco markers and ids. + * + * This function receives the detected markers and returns the 2D position of the chessboard corners + * from a ChArUco board using the detected Aruco markers. + * + * If markerCorners and markerCorners are empty, the detectMarkers() will run and detect aruco markers and ids. + * + * If camera parameters are provided, the process is based in an approximated pose estimation, else it is based on local homography. + * Only visible corners are returned. For each corner, its corresponding identifier is also returned in charucoIds. 
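
A minimal sketch of the detectBoard() flow (the board geometry and the image path are placeholders):

```cpp
#include <opencv2/objdetect.hpp>   // also pulls in charuco_detector.hpp
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main() {
    cv::Mat image = cv::imread("charuco_frame.png");

    // 5x7 ChArUco board: 0.04 m squares with 0.02 m markers inside the white squares.
    cv::aruco::Dictionary dict = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_4X4_50);
    cv::aruco::CharucoBoard board(cv::Size(5, 7), 0.04f, 0.02f, dict);

    cv::aruco::CharucoDetector detector(board);
    std::vector<cv::Point2f> charucoCorners;
    std::vector<int> charucoIds;
    detector.detectBoard(image, charucoCorners, charucoIds);

    // Visualize the interpolated chessboard corners and their ids.
    cv::aruco::drawDetectedCornersCharuco(image, charucoCorners, charucoIds);
    return 0;
}
```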
+ * @sa findChessboardCorners + * @note After OpenCV 4.6.0, there was an incompatible change in the ChArUco pattern generation algorithm for even row counts. + * Use cv::aruco::CharucoBoard::setLegacyPattern() to ensure compatibility with patterns created using OpenCV versions prior to 4.6.0. + * For more information, see the issue: https://github.com/opencv/opencv/issues/23152 + */ + CV_WRAP void detectBoard(InputArray image, OutputArray charucoCorners, OutputArray charucoIds, + InputOutputArrayOfArrays markerCorners = noArray(), + InputOutputArray markerIds = noArray()) const; + + /** + * @brief Detect ChArUco Diamond markers + * + * @param image input image necessary for corner subpixel. + * @param diamondCorners output list of detected diamond corners (4 corners per diamond). The order + * is the same than in marker corners: top left, top right, bottom right and bottom left. Similar + * format than the corners returned by detectMarkers (e.g std::vector > ). + * @param diamondIds ids of the diamonds in diamondCorners. The id of each diamond is in fact of + * type Vec4i, so each diamond has 4 ids, which are the ids of the aruco markers composing the + * diamond. + * @param markerCorners list of detected marker corners from detectMarkers function. + * If markerCorners and markerCorners are empty, the function detect aruco markers and ids. + * @param markerIds list of marker ids in markerCorners. + * If markerCorners and markerCorners are empty, the function detect aruco markers and ids. + * + * This function detects Diamond markers from the previous detected ArUco markers. The diamonds + * are returned in the diamondCorners and diamondIds parameters. If camera calibration parameters + * are provided, the diamond search is based on reprojection. If not, diamond search is based on + * homography. Homography is faster than reprojection, but less accurate. + */ + CV_WRAP void detectDiamonds(InputArray image, OutputArrayOfArrays diamondCorners, OutputArray diamondIds, + InputOutputArrayOfArrays markerCorners = noArray(), + InputOutputArray markerIds = noArray()) const; +protected: + struct CharucoDetectorImpl; + Ptr charucoDetectorImpl; +}; + +/** + * @brief Draws a set of Charuco corners + * @param image input/output image. It must have 1 or 3 channels. The number of channels is not + * altered. + * @param charucoCorners vector of detected charuco corners + * @param charucoIds list of identifiers for each corner in charucoCorners + * @param cornerColor color of the square surrounding each corner + * + * This function draws a set of detected Charuco corners. If identifiers vector is provided, it also + * draws the id of each corner. + */ +CV_EXPORTS_W void drawDetectedCornersCharuco(InputOutputArray image, InputArray charucoCorners, + InputArray charucoIds = noArray(), Scalar cornerColor = Scalar(255, 0, 0)); + +/** + * @brief Draw a set of detected ChArUco Diamond markers + * + * @param image input/output image. It must have 1 or 3 channels. The number of channels is not + * altered. + * @param diamondCorners positions of diamond corners in the same format returned by + * detectCharucoDiamond(). (e.g std::vector > ). For N detected markers, + * the dimensions of this array should be Nx4. The order of the corners should be clockwise. + * @param diamondIds vector of identifiers for diamonds in diamondCorners, in the same format + * returned by detectCharucoDiamond() (e.g. std::vector). + * Optional, if not provided, ids are not painted. + * @param borderColor color of marker borders. 
Rest of colors (text color and first corner color) + * are calculated based on this one. + * + * Given an array of detected diamonds, this functions draws them in the image. The marker borders + * are painted and the markers identifiers if provided. + * Useful for debugging purposes. + */ +CV_EXPORTS_W void drawDetectedDiamonds(InputOutputArray image, InputArrayOfArrays diamondCorners, + InputArray diamondIds = noArray(), + Scalar borderColor = Scalar(0, 0, 255)); + +//! @} + +} +} + +#endif diff --git a/3rdParty/opencv2/objdetect/detection_based_tracker.hpp b/3rdParty/opencv2/objdetect/detection_based_tracker.hpp index 3ba2cc8b46..8fe43c2c6b 100644 --- a/3rdParty/opencv2/objdetect/detection_based_tracker.hpp +++ b/3rdParty/opencv2/objdetect/detection_based_tracker.hpp @@ -192,7 +192,7 @@ class CV_EXPORTS DetectionBasedTracker { lastPositions.push_back(rect); id=getNextId(); - }; + } static int getNextId() { diff --git a/3rdParty/opencv2/objdetect/face.hpp b/3rdParty/opencv2/objdetect/face.hpp index 69f4aab9f1..3528d73bc6 100644 --- a/3rdParty/opencv2/objdetect/face.hpp +++ b/3rdParty/opencv2/objdetect/face.hpp @@ -20,7 +20,7 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac class CV_EXPORTS_W FaceDetectorYN { public: - virtual ~FaceDetectorYN() {}; + virtual ~FaceDetectorYN() {} /** @brief Set the size for the network input, which overwrites the input size of creating model. Call this method when the size of input image does not match the input size when creating model * @@ -54,14 +54,24 @@ class CV_EXPORTS_W FaceDetectorYN CV_WRAP virtual int getTopK() = 0; - /** @brief A simple interface to detect face from given image - * + /** @brief Detects faces in the input image. Following is an example output. + + * ![image](pics/lena-face-detection.jpg) + * @param image an image to detect - * @param faces detection results stored in a cv::Mat + * @param faces detection results stored in a 2D cv::Mat of shape [num_faces, 15] + * - 0-1: x, y of bbox top left corner + * - 2-3: width, height of bbox + * - 4-5: x, y of right eye (blue point in the example image) + * - 6-7: x, y of left eye (red point in the example image) + * - 8-9: x, y of nose tip (green point in the example image) + * - 10-11: x, y of right corner of mouth (pink point in the example image) + * - 12-13: x, y of left corner of mouth (yellow point in the example image) + * - 14: face score */ CV_WRAP virtual int detect(InputArray image, OutputArray faces) = 0; - /** @brief Creates an instance of this class with given parameters + /** @brief Creates an instance of face detector class with given parameters * * @param model the path to the requested model * @param config the path to the config file for compability, which is not requested for ONNX models @@ -72,14 +82,37 @@ class CV_EXPORTS_W FaceDetectorYN * @param backend_id the id of backend * @param target_id the id of target device */ - CV_WRAP static Ptr create(const String& model, - const String& config, + CV_WRAP static Ptr create(CV_WRAP_FILE_PATH const String& model, + CV_WRAP_FILE_PATH const String& config, + const Size& input_size, + float score_threshold = 0.9f, + float nms_threshold = 0.3f, + int top_k = 5000, + int backend_id = 0, + int target_id = 0); + + /** @overload + * + * @param framework Name of origin framework + * @param bufferModel A buffer with a content of binary file with weights + * @param bufferConfig A buffer with a content of text file contains network configuration + * @param input_size the size of the input image + * 
@param score_threshold the threshold to filter out bounding boxes of score smaller than the given value + * @param nms_threshold the threshold to suppress bounding boxes of IoU bigger than the given value + * @param top_k keep top K bboxes before NMS + * @param backend_id the id of backend + * @param target_id the id of target device + */ + CV_WRAP static Ptr create(const String& framework, + const std::vector& bufferModel, + const std::vector& bufferConfig, const Size& input_size, float score_threshold = 0.9f, float nms_threshold = 0.3f, int top_k = 5000, int backend_id = 0, int target_id = 0); + }; /** @brief DNN-based face recognizer @@ -89,29 +122,29 @@ model download link: https://github.com/opencv/opencv_zoo/tree/master/models/fac class CV_EXPORTS_W FaceRecognizerSF { public: - virtual ~FaceRecognizerSF() {}; + virtual ~FaceRecognizerSF() {} /** @brief Definition of distance used for calculating the distance between two face features */ enum DisType { FR_COSINE=0, FR_NORM_L2=1 }; - /** @brief Aligning image to put face on the standard position + /** @brief Aligns detected face with the source input image and crops it * @param src_img input image - * @param face_box the detection result used for indicate face in input image + * @param face_box the detected face result from the input image * @param aligned_img output aligned image */ CV_WRAP virtual void alignCrop(InputArray src_img, InputArray face_box, OutputArray aligned_img) const = 0; - /** @brief Extracting face feature from aligned image + /** @brief Extracts face feature from aligned image * @param aligned_img input aligned image * @param face_feature output face feature */ CV_WRAP virtual void feature(InputArray aligned_img, OutputArray face_feature) = 0; - /** @brief Calculating the distance between two face features + /** @brief Calculates the distance between two face features * @param face_feature1 the first input feature * @param face_feature2 the second input feature of the same size and the same type as face_feature1 - * @param dis_type defining the similarity with optional values "FR_OSINE" or "FR_NORM_L2" + * @param dis_type defines how to calculate the distance between two face features with optional values "FR_COSINE" or "FR_NORM_L2" */ CV_WRAP virtual double match(InputArray face_feature1, InputArray face_feature2, int dis_type = FaceRecognizerSF::FR_COSINE) const = 0; @@ -121,7 +154,23 @@ class CV_EXPORTS_W FaceRecognizerSF * @param backend_id the id of backend * @param target_id the id of target device */ - CV_WRAP static Ptr create(const String& model, const String& config, int backend_id = 0, int target_id = 0); + CV_WRAP static Ptr create(CV_WRAP_FILE_PATH const String& model, CV_WRAP_FILE_PATH const String& config, int backend_id = 0, int target_id = 0); + + /** + * @brief Creates an instance of this class from a buffer containing the model weights and configuration. + * @param framework Name of the framework (ONNX, etc.) + * @param bufferModel A buffer containing the binary model weights. + * @param bufferConfig A buffer containing the network configuration. + * @param backend_id The id of the backend. + * @param target_id The id of the target device. + * + * @return A pointer to the created instance of FaceRecognizerSF. + */ + CV_WRAP static Ptr create(const String& framework, + const std::vector& bufferModel, + const std::vector& bufferConfig, + int backend_id = 0, + int target_id = 0); }; //! 
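A hedged end-to-end sketch of the detection and recognition flow documented above (the ONNX file names, image paths and decision thresholds are assumptions based on the linked opencv_zoo models, not guaranteed by this header):

```cpp
// Editor's sketch: detect faces, align, extract features, and compare two images.
#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat img1 = cv::imread("person1.jpg");
    cv::Mat img2 = cv::imread("person2.jpg");

    cv::Ptr<cv::FaceDetectorYN> detector =
        cv::FaceDetectorYN::create("face_detection_yunet.onnx", "", img1.size());
    cv::Ptr<cv::FaceRecognizerSF> recognizer =
        cv::FaceRecognizerSF::create("face_recognition_sface.onnx", "");

    cv::Mat faces1, faces2;            // one row per face: x, y, w, h, 5 landmarks, score
    detector->detect(img1, faces1);
    detector->setInputSize(img2.size());
    detector->detect(img2, faces2);
    if (faces1.empty() || faces2.empty())
        return 1;

    cv::Mat aligned1, aligned2, feat1, feat2;
    recognizer->alignCrop(img1, faces1.row(0), aligned1);
    recognizer->alignCrop(img2, faces2.row(0), aligned2);
    recognizer->feature(aligned1, feat1);
    feat1 = feat1.clone();             // clone so feat1 does not alias an internal buffer
    recognizer->feature(aligned2, feat2);

    double cosScore = recognizer->match(feat1, feat2, cv::FaceRecognizerSF::FR_COSINE);
    double l2Dist   = recognizer->match(feat1, feat2, cv::FaceRecognizerSF::FR_NORM_L2);
    // 0.363 / 1.128 are thresholds commonly used with the SFace model (assumption).
    bool same = cosScore >= 0.363 && l2Dist <= 1.128;
    return same ? 0 : 2;
}
```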
@} diff --git a/3rdParty/opencv2/objdetect/graphical_code_detector.hpp b/3rdParty/opencv2/objdetect/graphical_code_detector.hpp new file mode 100644 index 0000000000..ed697c50c0 --- /dev/null +++ b/3rdParty/opencv2/objdetect/graphical_code_detector.hpp @@ -0,0 +1,85 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html +#ifndef OPENCV_OBJDETECT_GRAPHICAL_CODE_DETECTOR_HPP +#define OPENCV_OBJDETECT_GRAPHICAL_CODE_DETECTOR_HPP + +#include + +namespace cv { + +//! @addtogroup objdetect_common +//! @{ + +class CV_EXPORTS_W_SIMPLE GraphicalCodeDetector { +public: + CV_DEPRECATED_EXTERNAL // avoid using in C++ code, will be moved to "protected" (need to fix bindings first) + GraphicalCodeDetector(); + + GraphicalCodeDetector(const GraphicalCodeDetector&) = default; + GraphicalCodeDetector(GraphicalCodeDetector&&) = default; + GraphicalCodeDetector& operator=(const GraphicalCodeDetector&) = default; + GraphicalCodeDetector& operator=(GraphicalCodeDetector&&) = default; + + /** @brief Detects graphical code in image and returns the quadrangle containing the code. + @param img grayscale or color (BGR) image containing (or not) graphical code. + @param points Output vector of vertices of the minimum-area quadrangle containing the code. + */ + CV_WRAP bool detect(InputArray img, OutputArray points) const; + + /** @brief Decodes graphical code in image once it's found by the detect() method. + + Returns UTF8-encoded output string or empty string if the code cannot be decoded. + @param img grayscale or color (BGR) image containing graphical code. + @param points Quadrangle vertices found by detect() method (or some other algorithm). + @param straight_code The optional output image containing binarized code, will be empty if not found. + */ + CV_WRAP std::string decode(InputArray img, InputArray points, OutputArray straight_code = noArray()) const; + + /** @brief Both detects and decodes graphical code + + @param img grayscale or color (BGR) image containing graphical code. + @param points optional output array of vertices of the found graphical code quadrangle, will be empty if not found. + @param straight_code The optional output image containing binarized code + */ + CV_WRAP std::string detectAndDecode(InputArray img, OutputArray points = noArray(), + OutputArray straight_code = noArray()) const; + + + /** @brief Detects graphical codes in image and returns the vector of the quadrangles containing the codes. + @param img grayscale or color (BGR) image containing (or not) graphical codes. + @param points Output vector of vector of vertices of the minimum-area quadrangle containing the codes. + */ + CV_WRAP bool detectMulti(InputArray img, OutputArray points) const; + + /** @brief Decodes graphical codes in image once it's found by the detect() method. + @param img grayscale or color (BGR) image containing graphical codes. + @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded. + @param points vector of Quadrangle vertices found by detect() method (or some other algorithm). 
+ @param straight_code The optional output vector of images containing binarized codes + */ + CV_WRAP bool decodeMulti(InputArray img, InputArray points, CV_OUT std::vector& decoded_info, + OutputArrayOfArrays straight_code = noArray()) const; + + /** @brief Both detects and decodes graphical codes + @param img grayscale or color (BGR) image containing graphical codes. + @param decoded_info UTF8-encoded output vector of string or empty vector of string if the codes cannot be decoded. + @param points optional output vector of vertices of the found graphical code quadrangles. Will be empty if not found. + @param straight_code The optional vector of images containing binarized codes + + - If there are QR codes encoded with a Structured Append mode on the image and all of them detected and decoded correctly, + method writes a full message to position corresponds to 0-th code in a sequence. The rest of QR codes from the same sequence + have empty string. + */ + CV_WRAP bool detectAndDecodeMulti(InputArray img, CV_OUT std::vector& decoded_info, OutputArray points = noArray(), + OutputArrayOfArrays straight_code = noArray()) const; + struct Impl; +protected: + Ptr p; +}; + +//! @} + +} + +#endif diff --git a/3rdParty/opencv2/photo.hpp b/3rdParty/opencv2/photo.hpp index b17b2550b1..66f00b7e90 100644 --- a/3rdParty/opencv2/photo.hpp +++ b/3rdParty/opencv2/photo.hpp @@ -55,30 +55,29 @@ This module includes photo processing algorithms @defgroup photo_denoise Denoising @defgroup photo_hdr HDR imaging -This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment, -camera calibration with multiple exposures and exposure fusion. + This section describes high dynamic range imaging algorithms namely tonemapping, exposure alignment, + camera calibration with multiple exposures and exposure fusion. @defgroup photo_decolor Contrast Preserving Decolorization -Useful links: + Useful links: -http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html + http://www.cse.cuhk.edu.hk/leojia/projects/color2gray/index.html @defgroup photo_clone Seamless Cloning -Useful links: + Useful links: -https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp + https://www.learnopencv.com/seamless-cloning-using-opencv-python-cpp @defgroup photo_render Non-Photorealistic Rendering -Useful links: + Useful links: -http://www.inf.ufrgs.br/~eslgastal/DomainTransform + http://www.inf.ufrgs.br/~eslgastal/DomainTransform -https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/ + https://www.learnopencv.com/non-photorealistic-rendering-using-opencv-python-c/ - @defgroup photo_c C API @} */ @@ -201,8 +200,8 @@ CV_EXPORTS_W void fastNlMeansDenoisingColored( InputArray src, OutputArray dst, /** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been captured in small period of time. For example video. This version of the function is for grayscale -images or for manual manipulation with colorspaces. For more details see - +images or for manual manipulation with colorspaces. See @cite Buades2005DenoisingIS for more details +(open access [here](https://static.aminer.org/pdf/PDF/000/317/196/spatio_temporal_wiener_filtering_of_image_sequences_using_a_parametric.pdf)). @param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel images sequence. All images should have the same type and @@ -210,7 +209,7 @@ size. 
@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence @param temporalWindowSize Number of surrounding images to use for target image denoising. Should be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to -imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise +imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise srcImgs[imgToDenoiseIndex] image. @param dst Output image with the same size and type as srcImgs images. @param templateWindowSize Size in pixels of the template patch that is used to compute weights. @@ -228,8 +227,8 @@ CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputA /** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been captured in small period of time. For example video. This version of the function is for grayscale -images or for manual manipulation with colorspaces. For more details see - +images or for manual manipulation with colorspaces. See @cite Buades2005DenoisingIS for more details +(open access [here](https://static.aminer.org/pdf/PDF/000/317/196/spatio_temporal_wiener_filtering_of_image_sequences_using_a_parametric.pdf)). @param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel, 2-channel, 3-channel or 4-channel images sequence. All images should @@ -237,7 +236,7 @@ have the same type and size. @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence @param temporalWindowSize Number of surrounding images to use for target image denoising. Should be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to -imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise +imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise srcImgs[imgToDenoiseIndex] image. @param dst Output image with the same size and type as srcImgs images. @param templateWindowSize Size in pixels of the template patch that is used to compute weights. @@ -264,7 +263,7 @@ size. @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence @param temporalWindowSize Number of surrounding images to use for target image denoising. Should be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to -imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise +imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise srcImgs[imgToDenoiseIndex] image. @param dst Output image with the same size and type as srcImgs images. @param templateWindowSize Size in pixels of the template patch that is used to compute weights. @@ -709,33 +708,74 @@ CV_EXPORTS_W void decolor( InputArray src, OutputArray grayscale, OutputArray co //! @{ -//! seamlessClone algorithm flags -enum +//! Flags for the seamlessClone algorithm +enum SeamlessCloneFlags { - /** The power of the method is fully expressed when inserting objects with complex outlines into a new background*/ + /** + @brief Normal seamless cloning. + This method is ideal for inserting objects with complex outlines into a new background. + It preserves the original appearance and lighting of the inserted object, ensuring a natural blend. + */ NORMAL_CLONE = 1, - /** The classic method, color-based selection and alpha masking might be time consuming and often leaves an undesirable - halo. Seamless cloning, even averaged with the original image, is not effective. 
Mixed seamless cloning based on a loose selection proves effective.*/ - MIXED_CLONE = 2, - /** Monochrome transfer allows the user to easily replace certain features of one object by alternative features.*/ - MONOCHROME_TRANSFER = 3}; + + /** + @brief Mixed seamless cloning. + This method addresses cases where simple color-based selection or alpha masking is time-consuming + and may result in undesirable halos. By combining structure from the source and texture from the + destination, mixed seamless cloning is highly effective, even with loosely defined selections. + */ + MIXED_CLONE = 2, + + /** + @brief Monochrome transfer cloning. + This method allows users to replace specific features of an object, such as grayscale textures + or patterns, with alternative features. It is particularly useful for artistic effects or + targeted object modifications. + */ + MONOCHROME_TRANSFER = 3, + + /** + @brief Enhanced normal seamless cloning. + Similar to `NORMAL_CLONE`, but with an advanced approach to ROI (Region of Interest) calculation. + This mode processes a larger source region by considering the entire mask area instead of only + the bounding rectangle of non-zero pixels. + */ + NORMAL_CLONE_WIDE = 9, + + /** + @brief Enhanced mixed seamless cloning. + Similar to `MIXED_CLONE`, but with an advanced approach to ROI (Region of Interest) calculation. + This mode processes a larger source region by considering the entire mask area instead of only + the bounding rectangle of non-zero pixels. + */ + MIXED_CLONE_WIDE = 10, + + /** + @brief Enhanced monochrome transfer cloning. + Similar to `MONOCHROME_TRANSFER`, but with an advanced approach to ROI (Region of Interest) calculation. + This mode processes a larger source region by considering the entire mask area instead of only + the bounding rectangle of non-zero pixels. + */ + MONOCHROME_TRANSFER_WIDE = 11 +}; /** @example samples/cpp/tutorial_code/photo/seamless_cloning/cloning_demo.cpp An example using seamlessClone function */ -/** @brief Image editing tasks concern either global changes (color/intensity corrections, filters, -deformations) or local changes concerned to a selection. Here we are interested in achieving local -changes, ones that are restricted to a region manually selected (ROI), in a seamless and effortless -manner. The extent of the changes ranges from slight distortions to complete replacement by novel -content @cite PM03 . - -@param src Input 8-bit 3-channel image. -@param dst Input 8-bit 3-channel image. -@param mask Input 8-bit 1 or 3-channel image. -@param p Point in dst image where object is placed. -@param blend Output image with the same size and type as dst. -@param flags Cloning method that could be cv::NORMAL_CLONE, cv::MIXED_CLONE or cv::MONOCHROME_TRANSFER +/** @brief Performs seamless cloning to blend a region from a source image into a destination image. +This function is designed for local image editing, allowing changes restricted to a region +(manually selected as the ROI) to be applied effortlessly and seamlessly. These changes can +range from slight distortions to complete replacement by novel content @cite PM03. + +@param src The source image (8-bit 3-channel), from which a region will be blended into the destination. +@param dst The destination image (8-bit 3-channel), where the src image will be blended. +@param mask A binary mask (8-bit, 1, 3, or 4-channel) specifying the region in the source image to blend. +Non-zero pixels indicate the region to be blended. 
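Before the remaining parameter notes, a minimal sketch of the cloning modes described above (an editor's illustration with assumed image files; the source patch must fit inside the destination at the chosen point):

```cpp
// Editor's sketch: blend a source patch into a destination image with two cloning modes.
#include <opencv2/photo.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat src = cv::imread("airplane.jpg");      // object to insert (placeholder file)
    cv::Mat dst = cv::imread("sky.jpg");           // background (placeholder file)

    // Blend the whole src rectangle: an all-white single-channel mask.
    cv::Mat mask(src.size(), CV_8UC1, cv::Scalar(255));

    // Point where the center of src is placed in dst.
    cv::Point center(dst.cols / 2, dst.rows / 2);

    cv::Mat normal, mixed;
    cv::seamlessClone(src, dst, mask, center, normal, cv::NORMAL_CLONE);
    cv::seamlessClone(src, dst, mask, center, mixed,  cv::MIXED_CLONE);

    cv::imwrite("normal_clone.png", normal);
    cv::imwrite("mixed_clone.png", mixed);
    return 0;
}
```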
If an empty Mat is provided, a mask with +all non-zero pixels is created internally. +@param p The point where the center of the src image is placed in the dst image. +@param blend The output image that stores the result of the seamless cloning. It has the same size and type as `dst`. +@param flags Flags that control the type of cloning method, can take values of `cv::SeamlessCloneFlags`. */ CV_EXPORTS_W void seamlessClone( InputArray src, InputArray dst, InputArray mask, Point p, OutputArray blend, int flags); diff --git a/3rdParty/opencv2/photo/cuda.hpp b/3rdParty/opencv2/photo/cuda.hpp index c879acd640..cb1dd9caff 100644 --- a/3rdParty/opencv2/photo/cuda.hpp +++ b/3rdParty/opencv2/photo/cuda.hpp @@ -65,11 +65,20 @@ BORDER_REPLICATE , BORDER_CONSTANT , BORDER_REFLECT and BORDER_WRAP are supporte fastNlMeansDenoising */ CV_EXPORTS void nonLocalMeans(InputArray src, OutputArray dst, - float h, - int search_window = 21, - int block_size = 7, - int borderMode = BORDER_DEFAULT, - Stream& stream = Stream::Null()); + float h, + int search_window = 21, + int block_size = 7, + int borderMode = BORDER_DEFAULT, + Stream& stream = Stream::Null()); +CV_WRAP inline void nonLocalMeans(const GpuMat& src, CV_OUT GpuMat& dst, + float h, + int search_window = 21, + int block_size = 7, + int borderMode = BORDER_DEFAULT, + Stream& stream = Stream::Null()) +{ + nonLocalMeans(InputArray(src), OutputArray(dst), h, search_window, block_size, borderMode, stream); +} /** @brief Perform image denoising using Non-local Means Denoising algorithm with several computational @@ -93,10 +102,18 @@ FastNonLocalMeansDenoising::labMethod. fastNlMeansDenoising */ CV_EXPORTS void fastNlMeansDenoising(InputArray src, OutputArray dst, - float h, - int search_window = 21, - int block_size = 7, - Stream& stream = Stream::Null()); + float h, + int search_window = 21, + int block_size = 7, + Stream& stream = Stream::Null()); +CV_WRAP inline void fastNlMeansDenoising(const GpuMat& src, CV_OUT GpuMat& dst, + float h, + int search_window = 21, + int block_size = 7, + Stream& stream = Stream::Null()) +{ + fastNlMeansDenoising(InputArray(src), OutputArray(dst), h, search_window, block_size, stream); +} /** @brief Modification of fastNlMeansDenoising function for colored images @@ -124,6 +141,14 @@ CV_EXPORTS void fastNlMeansDenoisingColored(InputArray src, OutputArray dst, int search_window = 21, int block_size = 7, Stream& stream = Stream::Null()); +CV_WRAP inline void fastNlMeansDenoisingColored(const GpuMat& src, CV_OUT GpuMat& dst, + float h_luminance, float photo_render, + int search_window = 21, + int block_size = 7, + Stream& stream = Stream::Null()) +{ + fastNlMeansDenoisingColored(InputArray(src), OutputArray(dst), h_luminance, photo_render, search_window, block_size, stream); +} //! 
@} photo diff --git a/3rdParty/opencv2/stitching.hpp b/3rdParty/opencv2/stitching.hpp index 13622401ce..b38210d9d4 100644 --- a/3rdParty/opencv2/stitching.hpp +++ b/3rdParty/opencv2/stitching.hpp @@ -299,9 +299,22 @@ class CV_EXPORTS_W Stitcher */ CV_WRAP Status stitch(InputArrayOfArrays images, InputArrayOfArrays masks, OutputArray pano); - std::vector component() const { return indices_; } - std::vector cameras() const { return cameras_; } + /** @brief Returns indeces of input images used in panorama stitching + */ + CV_WRAP std::vector component() const { return indices_; } + + /** Returns estimated camera parameters for all stitched images + */ + CV_WRAP std::vector cameras() const { return cameras_; } CV_WRAP double workScale() const { return work_scale_; } + + /** @brief Return the mask of the panorama. + + The mask is a 8U UMat with the values: 0xFF (white) for pixels filled by the input images, + 0 (black) for unused pixels. It can be used as the mask for inpaint. + + @return The mask. + */ UMat resultMask() const { return result_mask_; } private: diff --git a/3rdParty/opencv2/stitching/detail/exposure_compensate.hpp b/3rdParty/opencv2/stitching/detail/exposure_compensate.hpp index 0111e7980a..fb4eb71eb4 100644 --- a/3rdParty/opencv2/stitching/detail/exposure_compensate.hpp +++ b/3rdParty/opencv2/stitching/detail/exposure_compensate.hpp @@ -85,10 +85,10 @@ class CV_EXPORTS_W ExposureCompensator @param mask Image mask */ CV_WRAP virtual void apply(int index, Point corner, InputOutputArray image, InputArray mask) = 0; - CV_WRAP virtual void getMatGains(CV_OUT std::vector& ) {CV_Error(Error::StsInternal, "");}; - CV_WRAP virtual void setMatGains(std::vector& ) { CV_Error(Error::StsInternal, ""); }; - CV_WRAP void setUpdateGain(bool b) { updateGain = b; }; - CV_WRAP bool getUpdateGain() { return updateGain; }; + CV_WRAP virtual void getMatGains(CV_OUT std::vector& ) {CV_Error(Error::StsInternal, "");} + CV_WRAP virtual void setMatGains(std::vector& ) { CV_Error(Error::StsInternal, ""); } + CV_WRAP void setUpdateGain(bool b) { updateGain = b; } + CV_WRAP bool getUpdateGain() { return updateGain; } protected : bool updateGain; }; @@ -101,8 +101,8 @@ class CV_EXPORTS_W NoExposureCompensator : public ExposureCompensator void feed(const std::vector &/*corners*/, const std::vector &/*images*/, const std::vector > &/*masks*/) CV_OVERRIDE { } CV_WRAP void apply(int /*index*/, Point /*corner*/, InputOutputArray /*image*/, InputArray /*mask*/) CV_OVERRIDE { } - CV_WRAP void getMatGains(CV_OUT std::vector& umv) CV_OVERRIDE { umv.clear(); return; }; - CV_WRAP void setMatGains(std::vector& umv) CV_OVERRIDE { umv.clear(); return; }; + CV_WRAP void getMatGains(CV_OUT std::vector& umv) CV_OVERRIDE { umv.clear(); return; } + CV_WRAP void setMatGains(std::vector& umv) CV_OVERRIDE { umv.clear(); return; } }; /** @brief Exposure compensator which tries to remove exposure related artifacts by adjusting image diff --git a/3rdParty/opencv2/stitching/detail/matchers.hpp b/3rdParty/opencv2/stitching/detail/matchers.hpp index 32fdb490a8..44a623ae6c 100644 --- a/3rdParty/opencv2/stitching/detail/matchers.hpp +++ b/3rdParty/opencv2/stitching/detail/matchers.hpp @@ -61,7 +61,7 @@ struct CV_EXPORTS_W_SIMPLE ImageFeatures CV_PROP_RW Size img_size; CV_PROP_RW std::vector keypoints; CV_PROP_RW UMat descriptors; - CV_WRAP std::vector getKeypoints() { return keypoints; }; + CV_WRAP std::vector getKeypoints() { return keypoints; } }; /** @brief @@ -109,8 +109,8 @@ struct CV_EXPORTS_W_SIMPLE MatchesInfo CV_PROP_RW int 
num_inliers; //!< Number of geometrically consistent matches CV_PROP_RW Mat H; //!< Estimated transformation CV_PROP_RW double confidence; //!< Confidence two images are from the same panorama - CV_WRAP std::vector getMatches() { return matches; }; - CV_WRAP std::vector getInliers() { return inliers_mask; }; + CV_WRAP std::vector getMatches() { return matches; } + CV_WRAP std::vector getInliers() { return inliers_mask; } }; /** @brief Feature matchers base class. */ @@ -138,7 +138,7 @@ class CV_EXPORTS_W FeaturesMatcher @sa detail::MatchesInfo */ CV_WRAP_AS(apply2) void operator ()(const std::vector &features, CV_OUT std::vector &pairwise_matches, - const cv::UMat &mask = cv::UMat()); + const cv::UMat &mask = cv::UMat()) { match(features, pairwise_matches, mask); } /** @return True, if it's possible to use the same matcher instance in parallel, false otherwise */ @@ -161,6 +161,16 @@ class CV_EXPORTS_W FeaturesMatcher virtual void match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo& matches_info) = 0; + /** @brief This method implements logic to match features between arbitrary number of features. + By default this checks every pair of inputs in the input, but the behaviour can be changed by subclasses. + + @param features vector of image features + @param pairwise_matches found matches + @param mask (optional) mask indicating which image pairs should be matched + */ + virtual void match(const std::vector &features, std::vector &pairwise_matches, + const cv::UMat &mask = cv::UMat()); + bool is_thread_safe_; }; @@ -180,19 +190,22 @@ class CV_EXPORTS_W BestOf2NearestMatcher : public FeaturesMatcher estimation used in the inliers classification step @param num_matches_thresh2 Minimum number of matches required for the 2D projective transform re-estimation on inliers + @param matches_confindece_thresh Matching confidence threshold to take the match into account. + The threshold was determined experimentally and set to 3 by default. 
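For context, a small sketch of how the pairwise matcher whose parameters are documented above is typically driven (image paths and the choice of ORB features are assumptions; default thresholds as listed above):

```cpp
// Editor's sketch: compute features for each image and match every pair.
#include <opencv2/stitching/detail/matchers.hpp>
#include <opencv2/features2d.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    std::vector<cv::Mat> images = { cv::imread("pano1.jpg"), cv::imread("pano2.jpg") };

    std::vector<cv::detail::ImageFeatures> features(images.size());
    cv::detail::computeImageFeatures(cv::ORB::create(), images, features);

    // Defaults: match_conf = 0.3, num_matches_thresh1/2 = 6, matches_confindece_thresh = 3.
    cv::Ptr<cv::detail::BestOf2NearestMatcher> matcher =
        cv::detail::BestOf2NearestMatcher::create();

    std::vector<cv::detail::MatchesInfo> pairwise;
    (*matcher)(features, pairwise);   // checks every pair of input images
    matcher->collectGarbage();

    // pairwise[i * images.size() + j] describes matches from image i to image j,
    // including the estimated homography H and a confidence value.
    return 0;
}
```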
*/ CV_WRAP BestOf2NearestMatcher(bool try_use_gpu = false, float match_conf = 0.3f, int num_matches_thresh1 = 6, - int num_matches_thresh2 = 6); + int num_matches_thresh2 = 6, double matches_confindece_thresh = 3.); CV_WRAP void collectGarbage() CV_OVERRIDE; CV_WRAP static Ptr create(bool try_use_gpu = false, float match_conf = 0.3f, int num_matches_thresh1 = 6, - int num_matches_thresh2 = 6); + int num_matches_thresh2 = 6, double matches_confindece_thresh = 3.); protected: void match(const ImageFeatures &features1, const ImageFeatures &features2, MatchesInfo &matches_info) CV_OVERRIDE; int num_matches_thresh1_; int num_matches_thresh2_; + double matches_confindece_thresh_; Ptr impl_; }; @@ -202,11 +215,12 @@ class CV_EXPORTS_W BestOf2NearestRangeMatcher : public BestOf2NearestMatcher CV_WRAP BestOf2NearestRangeMatcher(int range_width = 5, bool try_use_gpu = false, float match_conf = 0.3f, int num_matches_thresh1 = 6, int num_matches_thresh2 = 6); - void operator ()(const std::vector &features, std::vector &pairwise_matches, - const cv::UMat &mask = cv::UMat()); - - protected: + // indicate that we do not want to hide the base class match method with a different signature + using BestOf2NearestMatcher::match; + void match(const std::vector &features, std::vector &pairwise_matches, + const cv::UMat &mask = cv::UMat()) CV_OVERRIDE; + int range_width_; }; diff --git a/3rdParty/opencv2/stitching/detail/motion_estimators.hpp b/3rdParty/opencv2/stitching/detail/motion_estimators.hpp index 07bd032c6f..00c61343a2 100644 --- a/3rdParty/opencv2/stitching/detail/motion_estimators.hpp +++ b/3rdParty/opencv2/stitching/detail/motion_estimators.hpp @@ -353,8 +353,8 @@ void CV_EXPORTS_W waveCorrect(CV_IN_OUT std::vector &rmats, WaveCorrectKind // Auxiliary functions // Returns matches graph representation in DOT language -String CV_EXPORTS_W matchesGraphAsString(std::vector &pathes, std::vector &pairwise_matches, - float conf_threshold); +String CV_EXPORTS_W matchesGraphAsString(std::vector &paths, std::vector &pairwise_matches, + float conf_threshold); CV_EXPORTS_W std::vector leaveBiggestComponent( std::vector &features, diff --git a/3rdParty/opencv2/stitching/detail/seam_finders.hpp b/3rdParty/opencv2/stitching/detail/seam_finders.hpp index 1fa8324ce4..1c13cce101 100644 --- a/3rdParty/opencv2/stitching/detail/seam_finders.hpp +++ b/3rdParty/opencv2/stitching/detail/seam_finders.hpp @@ -248,7 +248,7 @@ class CV_EXPORTS_W GraphCutSeamFinder : public GraphCutSeamFinderBase, public Se ~GraphCutSeamFinder(); CV_WRAP void find(const std::vector &src, const std::vector &corners, - std::vector &masks) CV_OVERRIDE; + CV_IN_OUT std::vector &masks) CV_OVERRIDE; private: // To avoid GCGraph dependency diff --git a/3rdParty/opencv2/stitching/detail/warpers.hpp b/3rdParty/opencv2/stitching/detail/warpers.hpp index c7e5bb7b28..7dd16871e7 100644 --- a/3rdParty/opencv2/stitching/detail/warpers.hpp +++ b/3rdParty/opencv2/stitching/detail/warpers.hpp @@ -503,6 +503,11 @@ class CV_EXPORTS PlaneWarperGpu : public PlaneWarper public: PlaneWarperGpu(float scale = 1.f) : PlaneWarper(scale) {} +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE { Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_); @@ -536,6 +541,9 @@ class CV_EXPORTS PlaneWarperGpu : public PlaneWarper d_dst_.download(dst); return result; } +#if defined 
_MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap); @@ -557,6 +565,11 @@ class CV_EXPORTS SphericalWarperGpu : public SphericalWarper public: SphericalWarperGpu(float scale) : SphericalWarper(scale) {} +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE { Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_); @@ -573,6 +586,9 @@ class CV_EXPORTS SphericalWarperGpu : public SphericalWarper d_dst_.download(dst); return result; } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap); @@ -589,6 +605,11 @@ class CV_EXPORTS CylindricalWarperGpu : public CylindricalWarper public: CylindricalWarperGpu(float scale) : CylindricalWarper(scale) {} +// WARNING: unreachable code using Ninja +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(push) +#pragma warning(disable: 4702) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, OutputArray xmap, OutputArray ymap) CV_OVERRIDE { Rect result = buildMaps(src_size, K, R, d_xmap_, d_ymap_); @@ -605,6 +626,9 @@ class CV_EXPORTS CylindricalWarperGpu : public CylindricalWarper d_dst_.download(dst); return result; } +#if defined _MSC_VER && _MSC_VER >= 1920 +#pragma warning(pop) +#endif Rect buildMaps(Size src_size, InputArray K, InputArray R, cuda::GpuMat & xmap, cuda::GpuMat & ymap); diff --git a/3rdParty/opencv2/stitching/warpers.hpp b/3rdParty/opencv2/stitching/warpers.hpp index 2c1d90bf3f..34ef44e8f1 100644 --- a/3rdParty/opencv2/stitching/warpers.hpp +++ b/3rdParty/opencv2/stitching/warpers.hpp @@ -53,7 +53,7 @@ namespace cv { public: CV_WRAP PyRotationWarper(String type, float scale); - CV_WRAP PyRotationWarper() {}; + CV_WRAP PyRotationWarper() {} ~PyRotationWarper() {} /** @brief Projects the image point. diff --git a/3rdParty/opencv2/video.hpp b/3rdParty/opencv2/video.hpp index dce51f6a23..8ce49bc284 100644 --- a/3rdParty/opencv2/video.hpp +++ b/3rdParty/opencv2/video.hpp @@ -49,7 +49,6 @@ @{ @defgroup video_motion Motion Analysis @defgroup video_track Object Tracking - @defgroup video_c C API @} */ diff --git a/3rdParty/opencv2/video/detail/tracking.detail.hpp b/3rdParty/opencv2/video/detail/tracking.detail.hpp index 9883a061c3..eb954d6a16 100644 --- a/3rdParty/opencv2/video/detail/tracking.detail.hpp +++ b/3rdParty/opencv2/video/detail/tracking.detail.hpp @@ -171,7 +171,7 @@ width, height, orientation, etc. class CV_EXPORTS TrackerTargetState { public: - virtual ~TrackerTargetState() {}; + virtual ~TrackerTargetState() {} /** @brief Get the position * @return The position */ diff --git a/3rdParty/opencv2/video/tracking.hpp b/3rdParty/opencv2/video/tracking.hpp index 8363f44ec7..130bf1bcc0 100644 --- a/3rdParty/opencv2/video/tracking.hpp +++ b/3rdParty/opencv2/video/tracking.hpp @@ -166,7 +166,7 @@ performance boost. The function implements a sparse iterative version of the Lucas-Kanade optical flow in pyramids. See @cite Bouguet00 . The function is parallelized with the TBB library. 
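A minimal sparse Lucas-Kanade sketch for the function described above (frame file names are placeholders):

```cpp
// Editor's sketch: sparse pyramidal Lucas-Kanade tracking between two frames.
#include <opencv2/video/tracking.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>
#include <vector>

int main()
{
    cv::Mat prev = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat next = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);

    std::vector<cv::Point2f> prevPts, nextPts;
    cv::goodFeaturesToTrack(prev, prevPts, 200, 0.01, 10);   // corners to track

    std::vector<uchar> status;
    std::vector<float> err;
    cv::calcOpticalFlowPyrLK(prev, next, prevPts, nextPts, status, err,
                             cv::Size(21, 21), 3);           // window size, pyramid levels

    int tracked = 0;
    for (size_t i = 0; i < status.size(); ++i)
        if (status[i])
            ++tracked;                 // nextPts[i] is the new position of prevPts[i]
    std::cout << tracked << " of " << prevPts.size() << " points tracked\n";
    return 0;
}
```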
-@note +@note Some examples: - An example using the Lucas-Kanade optical flow algorithm can be found at opencv_source_code/samples/cpp/lkdemo.cpp @@ -213,7 +213,7 @@ The function finds an optical flow for each prev pixel using the @cite Farneback \f[\texttt{prev} (y,x) \sim \texttt{next} ( y + \texttt{flow} (y,x)[1], x + \texttt{flow} (y,x)[0])\f] -@note +@note Some examples: - An example using the optical flow algorithm described by Gunnar Farneback can be found at opencv_source_code/samples/cpp/fback.cpp @@ -564,6 +564,12 @@ class CV_EXPORTS_W VariationalRefinement : public DenseOpticalFlow /** @copybrief getGamma @see getGamma */ CV_WRAP virtual void setGamma(float val) = 0; + /** @brief Norm value shift for robust penalizer + @see setEpsilon */ + CV_WRAP virtual float getEpsilon() const = 0; + /** @copybrief getEpsilon @see getEpsilon */ + CV_WRAP virtual void setEpsilon(float val) = 0; + /** @brief Creates an instance of VariationalRefinement */ CV_WRAP static Ptr create(); @@ -645,6 +651,12 @@ class CV_EXPORTS_W DISOpticalFlow : public DenseOpticalFlow /** @copybrief getVariationalRefinementGamma @see getVariationalRefinementGamma */ CV_WRAP virtual void setVariationalRefinementGamma(float val) = 0; + /** @brief Norm value shift for robust penalizer + @see setVariationalRefinementEpsilon */ + CV_WRAP virtual float getVariationalRefinementEpsilon() const = 0; + /** @copybrief getVariationalRefinementEpsilon @see getVariationalRefinementEpsilon */ + CV_WRAP virtual void setVariationalRefinementEpsilon(float val) = 0; + /** @brief Whether to use mean-normalization of patches when computing patch distance. It is turned on by default as it typically provides a noticeable quality boost because of increased robustness to @@ -849,6 +861,81 @@ class CV_EXPORTS_W TrackerDaSiamRPN : public Tracker //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE; }; +/** @brief the Nano tracker is a super lightweight dnn-based general object tracking. + * + * Nano tracker is much faster and extremely lightweight due to special model structure, the whole model size is about 1.9 MB. + * Nano tracker needs two models: one for feature extraction (backbone) and the another for localization (neckhead). + * Model download link: https://github.com/HonglinChu/SiamTrackers/tree/master/NanoTrack/models/nanotrackv2 + * Original repo is here: https://github.com/HonglinChu/NanoTrack + * Author: HongLinChu, 1628464345@qq.com + */ +class CV_EXPORTS_W TrackerNano : public Tracker +{ +protected: + TrackerNano(); // use ::create() +public: + virtual ~TrackerNano() CV_OVERRIDE; + + struct CV_EXPORTS_W_SIMPLE Params + { + CV_WRAP Params(); + CV_PROP_RW std::string backbone; + CV_PROP_RW std::string neckhead; + CV_PROP_RW int backend; + CV_PROP_RW int target; + }; + + /** @brief Constructor + @param parameters NanoTrack parameters TrackerNano::Params + */ + static CV_WRAP + Ptr create(const TrackerNano::Params& parameters = TrackerNano::Params()); + + /** @brief Return tracking score + */ + CV_WRAP virtual float getTrackingScore() = 0; + + //void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE; + //bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE; +}; + +/** @brief the VIT tracker is a super lightweight dnn-based general object tracking. + * + * VIT tracker is much faster and extremely lightweight due to special model structure, the model file is about 767KB. 
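A hypothetical tracking loop for the TrackerNano API shown above (model file names, video path and the score threshold are assumptions); the same init/update pattern applies to the other Tracker subclasses in this header:

```cpp
// Editor's sketch: NanoTrack-based object tracking over a video.
#include <opencv2/video/tracking.hpp>
#include <opencv2/videoio.hpp>

int main()
{
    cv::TrackerNano::Params params;
    params.backbone = "nanotrack_backbone_sim.onnx";   // placeholder file names
    params.neckhead = "nanotrack_head_sim.onnx";
    cv::Ptr<cv::TrackerNano> tracker = cv::TrackerNano::create(params);

    cv::VideoCapture cap("input.mp4");
    cv::Mat frame;
    if (!cap.read(frame))
        return 1;

    cv::Rect roi(100, 100, 80, 80);   // assumed initial bounding box
    tracker->init(frame, roi);

    while (cap.read(frame))
    {
        bool found = tracker->update(frame, roi);
        if (found && tracker->getTrackingScore() > 0.3f)
        {
            // roi now holds the tracked object's position in this frame
        }
    }
    return 0;
}
```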
+ * Model download link: https://github.com/opencv/opencv_zoo/tree/main/models/object_tracking_vittrack + * Author: PengyuLiu, 1872918507@qq.com + */ +class CV_EXPORTS_W TrackerVit : public Tracker +{ +protected: + TrackerVit(); // use ::create() +public: + virtual ~TrackerVit() CV_OVERRIDE; + + struct CV_EXPORTS_W_SIMPLE Params + { + CV_WRAP Params(); + CV_PROP_RW std::string net; + CV_PROP_RW int backend; + CV_PROP_RW int target; + CV_PROP_RW Scalar meanvalue; + CV_PROP_RW Scalar stdvalue; + CV_PROP_RW float tracking_score_threshold; + }; + + /** @brief Constructor + @param parameters vit tracker parameters TrackerVit::Params + */ + static CV_WRAP + Ptr create(const TrackerVit::Params& parameters = TrackerVit::Params()); + + /** @brief Return tracking score + */ + CV_WRAP virtual float getTrackingScore() = 0; + + // void init(InputArray image, const Rect& boundingBox) CV_OVERRIDE; + // bool update(InputArray image, CV_OUT Rect& boundingBox) CV_OVERRIDE; +}; //! @} video_track diff --git a/3rdParty/opencv2/videoio.hpp b/3rdParty/opencv2/videoio.hpp index e57ee7c285..9fc080c4cf 100644 --- a/3rdParty/opencv2/videoio.hpp +++ b/3rdParty/opencv2/videoio.hpp @@ -84,8 +84,13 @@ namespace cv Select preferred API for a capture object. To be used in the VideoCapture::VideoCapture() constructor or VideoCapture::open() -@note Backends are available only if they have been built with your OpenCV binaries. +@note +- Backends are available only if they have been built with your OpenCV binaries. See @ref videoio_overview for more information. +- Microsoft Media Foundation backend tries to use hardware accelerated transformations +if possible. Environment flag "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" set to 0 +disables it and may improve initialization time. More details: +https://learn.microsoft.com/en-us/windows/win32/medfound/mf-readwrite-enable-hardware-transforms */ enum VideoCaptureAPIs { CAP_ANY = 0, //!< Auto detect == 0 @@ -103,11 +108,11 @@ enum VideoCaptureAPIs { CAP_PVAPI = 800, //!< PvAPI, Prosilica GigE SDK CAP_OPENNI = 900, //!< OpenNI (for Kinect) CAP_OPENNI_ASUS = 910, //!< OpenNI (for Asus Xtion) - CAP_ANDROID = 1000, //!< Android - not used + CAP_ANDROID = 1000, //!< MediaNDK (API Level 21+) and NDK Camera (API level 24+) for Android CAP_XIAPI = 1100, //!< XIMEA Camera API CAP_AVFOUNDATION = 1200, //!< AVFoundation framework for iOS (OS X Lion will have the same API) CAP_GIGANETIX = 1300, //!< Smartek Giganetix GigEVisionSDK - CAP_MSMF = 1400, //!< Microsoft Media Foundation (via videoInput) + CAP_MSMF = 1400, //!< Microsoft Media Foundation (via videoInput). See platform specific notes above. CAP_WINRT = 1410, //!< Microsoft Windows Runtime using Media Foundation CAP_INTELPERC = 1500, //!< RealSense (former Intel Perceptual Computing SDK) CAP_REALSENSE = 1500, //!< Synonym for CAP_INTELPERC @@ -123,8 +128,10 @@ enum VideoCaptureAPIs { CAP_INTEL_MFX = 2300, //!< Intel MediaSDK CAP_XINE = 2400, //!< XINE engine (Linux) CAP_UEYE = 2500, //!< uEye Camera API + CAP_OBSENSOR = 2600, //!< For Orbbec 3D-Sensor device/module (Astra+, Femto, Astra2, Gemini2, Gemini2L, Gemini2XL, Femto Mega) attention: Astra2 cameras currently only support Windows and Linux kernel versions no higher than 4.15, and higher versions of Linux kernel may have exceptions. }; + /** @brief cv::VideoCapture generic properties identifier. Reading / writing properties involves many layers. Some unexpected result might happens along this chain. 
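Given the caveat above that property reads and writes pass through several layers, a small defensive-usage sketch (file name and backend choice are assumptions):

```cpp
// Editor's sketch: verify what a property read/write actually did before relying on it.
#include <opencv2/videoio.hpp>
#include <iostream>

int main()
{
    cv::VideoCapture cap("video.mp4", cv::CAP_FFMPEG);   // assumed file and backend
    if (!cap.isOpened())
        return 1;

    double fps    = cap.get(cv::CAP_PROP_FPS);           // 0 typically means "not reported"
    double frames = cap.get(cv::CAP_PROP_FRAME_COUNT);
    std::cout << "fps=" << fps << " frames=" << frames << "\n";

    // set() returns false when the backend rejects the property; re-read to verify.
    if (cap.set(cv::CAP_PROP_POS_FRAMES, 100))
        std::cout << "now at frame " << cap.get(cv::CAP_PROP_POS_FRAMES) << "\n";
    return 0;
}
```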
@@ -133,7 +140,7 @@ enum VideoCaptureAPIs { */ enum VideoCaptureProperties { CAP_PROP_POS_MSEC =0, //!< Current position of the video file in milliseconds. - CAP_PROP_POS_FRAMES =1, //!< 0-based index of the frame to be decoded/captured next. + CAP_PROP_POS_FRAMES =1, //!< 0-based index of the frame to be decoded/captured next. When the index i is set in RAW mode (CAP_PROP_FORMAT == -1) this will seek to the key frame k, where k <= i. CAP_PROP_POS_AVI_RATIO =2, //!< Relative position of the video file: 0=start of the film, 1=end of the film. CAP_PROP_FRAME_WIDTH =3, //!< Width of the frames in the video stream. CAP_PROP_FRAME_HEIGHT =4, //!< Height of the frames in the video stream. @@ -181,14 +188,14 @@ enum VideoCaptureProperties { CAP_PROP_WB_TEMPERATURE=45, //!< white-balance color temperature CAP_PROP_CODEC_PIXEL_FORMAT =46, //!< (read-only) codec's pixel format. 4-character code - see VideoWriter::fourcc . Subset of [AV_PIX_FMT_*](https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/raw.c) or -1 if unknown CAP_PROP_BITRATE =47, //!< (read-only) Video bitrate in kbits/s - CAP_PROP_ORIENTATION_META=48, //!< (read-only) Frame rotation defined by stream meta (applicable for FFmpeg back-end only) - CAP_PROP_ORIENTATION_AUTO=49, //!< if true - rotates output frames of CvCapture considering video file's metadata (applicable for FFmpeg back-end only) (https://github.com/opencv/opencv/issues/15499) + CAP_PROP_ORIENTATION_META=48, //!< (read-only) Frame rotation defined by stream meta (applicable for FFmpeg and AVFoundation back-ends only) + CAP_PROP_ORIENTATION_AUTO=49, //!< if true - rotates output frames of CvCapture considering video file's metadata (applicable for FFmpeg and AVFoundation back-ends only) (https://github.com/opencv/opencv/issues/15499) CAP_PROP_HW_ACCELERATION=50, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in cv::VideoCapture constructor / .open() method. Default value is backend-specific. CAP_PROP_HW_DEVICE =51, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific. CAP_PROP_HW_ACCELERATION_USE_OPENCL=52, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between HW accelerated decoder and cv::UMat. - CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg back-end only) - CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg back-end only) - CAP_PROP_STREAM_OPEN_TIME_USEC =55, //). E.g. When reading from a h264 encoded RTSP stream, the FFmpeg backend could return the SPS and/or PPS if available (if sent in reply to a DESCRIBE request), from calls to cap.retrieve(data, ). + CAP_PROP_FRAME_TYPE = 69, //!< (read-only) FFmpeg back-end only - Frame type ascii code (73 = 'I', 80 = 'P', 66 = 'B' or 63 = '?' if unknown) of the most recently read frame. + CAP_PROP_N_THREADS = 70, //!< (**open-only**) Set the maximum number of threads to use. Use 0 to use as many threads as CPU cores (applicable for FFmpeg back-end only). + CAP_PROP_PTS = 71, //!< (read-only) FFmpeg back-end only - presentation timestamp of the most recently read frame using the FPS time base. e.g. 
fps = 25, VideoCapture::get(\ref CAP_PROP_PTS) = 3, presentation time = 3/25 seconds. + CAP_PROP_DTS_DELAY = 72, //!< (read-only) FFmpeg back-end only - maximum difference between presentation (pts) and decompression timestamps (dts) using FPS time base. e.g. delay is maximum when frame_num = 0, if true, VideoCapture::get(\ref CAP_PROP_PTS) = 0 and VideoCapture::get(\ref CAP_PROP_DTS_DELAY) = 2, dts = -2. Non zero values usually imply the stream is encoded using B-frames which are not decoded in presentation order. #ifndef CV_DOXYGEN CV__CAP_PROP_LATEST #endif @@ -216,10 +227,15 @@ enum VideoWriterProperties { VIDEOWRITER_PROP_NSTRIPES = 3, //!< Number of stripes for parallel encoding. -1 for auto detection. VIDEOWRITER_PROP_IS_COLOR = 4, //!< If it is not zero, the encoder will expect and encode color frames, otherwise it //!< will work with grayscale frames. - VIDEOWRITER_PROP_DEPTH = 5, //!< Defaults to CV_8U. + VIDEOWRITER_PROP_DEPTH = 5, //!< Defaults to \ref CV_8U. VIDEOWRITER_PROP_HW_ACCELERATION = 6, //!< (**open-only**) Hardware acceleration type (see #VideoAccelerationType). Setting supported only via `params` parameter in VideoWriter constructor / .open() method. Default value is backend-specific. VIDEOWRITER_PROP_HW_DEVICE = 7, //!< (**open-only**) Hardware device index (select GPU if multiple available). Device enumeration is acceleration type specific. VIDEOWRITER_PROP_HW_ACCELERATION_USE_OPENCL= 8, //!< (**open-only**) If non-zero, create new OpenCL context and bind it to current thread. The OpenCL context created with Video Acceleration context attached it (if not attached yet) for optimized GPU data copy between cv::UMat and HW accelerated encoder. + VIDEOWRITER_PROP_RAW_VIDEO = 9, //!< (**open-only**) Set to non-zero to enable encapsulation of an encoded raw video stream. Each raw encoded video frame should be passed to VideoWriter::write() as single row or column of a \ref CV_8UC1 Mat. \note If the key frame interval is not 1 then it must be manually specified by the user. This can either be performed during initialization passing \ref VIDEOWRITER_PROP_KEY_INTERVAL as one of the extra encoder params to \ref VideoWriter::VideoWriter(const String &, int, double, const Size &, const std::vector< int > ¶ms) or afterwards by setting the \ref VIDEOWRITER_PROP_KEY_FLAG with \ref VideoWriter::set() before writing each frame. FFMpeg backend only. + VIDEOWRITER_PROP_KEY_INTERVAL = 10, //!< (**open-only**) Set the key frame interval using raw video encapsulation (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). Defaults to 1 when not set. FFmpeg back-end only. + VIDEOWRITER_PROP_KEY_FLAG = 11, //!< Set to non-zero to signal that the following frames are key frames or zero if not, when encapsulating raw video (\ref VIDEOWRITER_PROP_RAW_VIDEO != 0). FFmpeg back-end only. + VIDEOWRITER_PROP_PTS = 12, //!< Specifies the frame presentation timestamp for each frame using the FPS time base. This property is **only** necessary when encapsulating **externally** encoded video where the decoding order differs from the presentation order, such as in GOP patterns with bi-directional B-frames. The value should be provided by your external encoder and for video sources with fixed frame rates it is equivalent to dividing the current frame's presentation time (\ref CAP_PROP_POS_MSEC) by the frame duration (1000.0 / VideoCapture::get(\ref CAP_PROP_FPS)). It can be queried from the resulting encapsulated video file using VideoCapture::get(\ref CAP_PROP_PTS). FFmpeg back-end only. 
+ VIDEOWRITER_PROP_DTS_DELAY = 13, //!< Specifies the maximum difference between presentation (pts) and decompression timestamps (dts) using the FPS time base. This property is necessary **only** when encapsulating **externally** encoded video where the decoding order differs from the presentation order, such as in GOP patterns with bi-directional B-frames. The value should be calculated based on the specific GOP pattern used during encoding. For example, in a GOP with presentation order IBP and decoding order IPB, this value would be 1, as the B-frame is the second frame presented but the third to be decoded. It can be queried from the resulting encapsulated video file using VideoCapture::get(\ref CAP_PROP_DTS_DELAY). Non-zero values usually imply the stream is encoded using B-frames. FFmpeg back-end only. #ifndef CV_DOXYGEN CV__VIDEOWRITER_PROP_LATEST #endif @@ -302,16 +318,23 @@ enum { CAP_PROP_OPENNI_OUTPUT_MODE = 100, CAP_PROP_OPENNI2_MIRROR = 111 }; +#ifdef _MSC_VER +#pragma warning( push ) +#pragma warning( disable: 5054 ) +#endif //! OpenNI shortcuts -enum { CAP_OPENNI_IMAGE_GENERATOR_PRESENT = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT, - CAP_OPENNI_IMAGE_GENERATOR_OUTPUT_MODE = CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_OUTPUT_MODE, - CAP_OPENNI_DEPTH_GENERATOR_PRESENT = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT, - CAP_OPENNI_DEPTH_GENERATOR_BASELINE = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_BASELINE, - CAP_OPENNI_DEPTH_GENERATOR_FOCAL_LENGTH = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_FOCAL_LENGTH, - CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION = CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_REGISTRATION, - CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION, - CAP_OPENNI_IR_GENERATOR_PRESENT = CAP_OPENNI_IR_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT, +enum { CAP_OPENNI_IMAGE_GENERATOR_PRESENT = +CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT, + CAP_OPENNI_IMAGE_GENERATOR_OUTPUT_MODE = +CAP_OPENNI_IMAGE_GENERATOR + CAP_PROP_OPENNI_OUTPUT_MODE, + CAP_OPENNI_DEPTH_GENERATOR_PRESENT = +CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT, + CAP_OPENNI_DEPTH_GENERATOR_BASELINE = +CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_BASELINE, + CAP_OPENNI_DEPTH_GENERATOR_FOCAL_LENGTH = +CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_FOCAL_LENGTH, + CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION = +CAP_OPENNI_DEPTH_GENERATOR + CAP_PROP_OPENNI_REGISTRATION, + CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION_ON = CAP_OPENNI_DEPTH_GENERATOR_REGISTRATION, + CAP_OPENNI_IR_GENERATOR_PRESENT = +CAP_OPENNI_IR_GENERATOR + CAP_PROP_OPENNI_GENERATOR_PRESENT }; +#ifdef _MSC_VER +#pragma warning( pop ) +#endif //! OpenNI data given from depth generator enum { CAP_OPENNI_DEPTH_MAP = 0, //!< Depth values in mm (CV_16UC1) @@ -557,6 +580,18 @@ enum { CAP_PROP_ARAVIS_AUTOTRIGGER = 600 //!< Autom //! @} ARAVIS + +/** @name Android + @{ +*/ + +//! Properties of cameras available through NDK Camera API backend +enum { CAP_PROP_ANDROID_DEVICE_TORCH = 8001, + }; + +//! @} Android + + /** @name AVFoundation framework for iOS @{ */ @@ -653,8 +688,56 @@ enum { CAP_PROP_IMAGES_BASE = 18000, //! @} Images +/** @name OBSENSOR (for Orbbec 3D-Sensor device/module ) + @{ +*/ +//! 
OBSENSOR data given from image generator +enum VideoCaptureOBSensorDataType{ + CAP_OBSENSOR_DEPTH_MAP = 0, //!< Depth values in mm (CV_16UC1) + CAP_OBSENSOR_BGR_IMAGE = 1, //!< Data given from BGR stream generator + CAP_OBSENSOR_IR_IMAGE = 2 //!< Data given from IR stream generator(CV_16UC1) +}; + +//! OBSENSOR stream generator +enum VideoCaptureOBSensorGenerators{ + CAP_OBSENSOR_DEPTH_GENERATOR = 1 << 29, + CAP_OBSENSOR_IMAGE_GENERATOR = 1 << 28, + CAP_OBSENSOR_IR_GENERATOR = 1 << 27, + CAP_OBSENSOR_GENERATORS_MASK = CAP_OBSENSOR_DEPTH_GENERATOR + CAP_OBSENSOR_IMAGE_GENERATOR + CAP_OBSENSOR_IR_GENERATOR +}; + +//!OBSENSOR properties +enum VideoCaptureOBSensorProperties{ + // INTRINSIC + CAP_PROP_OBSENSOR_INTRINSIC_FX=26001, + CAP_PROP_OBSENSOR_INTRINSIC_FY=26002, + CAP_PROP_OBSENSOR_INTRINSIC_CX=26003, + CAP_PROP_OBSENSOR_INTRINSIC_CY=26004, +}; + +//! @} OBSENSOR + //! @} videoio_flags_others +/** @brief Read data stream interface + */ +class CV_EXPORTS_W IStreamReader +{ +public: + virtual ~IStreamReader(); + + /** @brief Read bytes from stream */ + virtual long long read(char* buffer, long long size) = 0; + + /** @brief Sets the stream position + * + * @param offset Seek offset + * @param origin SEEK_SET / SEEK_END / SEEK_CUR + * + * @see fseek + */ + virtual long long seek(long long offset, int origin) = 0; +}; class IVideoCapture; //! @cond IGNORED @@ -734,6 +817,14 @@ class CV_EXPORTS_W VideoCapture */ CV_WRAP explicit VideoCapture(int index, int apiPreference, const std::vector& params); + /** @overload + @brief Opens a video using data stream. + + The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`. + See cv::VideoCaptureProperties + */ + CV_WRAP VideoCapture(const Ptr& source, int apiPreference, const std::vector& params); + /** @brief Default destructor The method first calls VideoCapture::release to close the already opened file or camera. @@ -788,6 +879,19 @@ class CV_EXPORTS_W VideoCapture */ CV_WRAP virtual bool open(int index, int apiPreference, const std::vector& params); + /** @brief Opens a video using data stream. + + @overload + + The `params` parameter allows to specify extra parameters encoded as pairs `(paramId_1, paramValue_1, paramId_2, paramValue_2, ...)`. + See cv::VideoCaptureProperties + + @return `true` if the file has been successfully opened + + The method first calls VideoCapture::release to close the already opened file or camera. + */ + CV_WRAP virtual bool open(const Ptr& source, int apiPreference, const std::vector& params); + /** @brief Returns true if video capturing has been initialized already. If the previous call to VideoCapture constructor or VideoCapture::open() succeeded, the method returns @@ -912,7 +1016,7 @@ class CV_EXPORTS_W VideoCapture CV_WRAP void setExceptionMode(bool enable) { throwOnFail = enable; } /// query if exception mode is active - CV_WRAP bool getExceptionMode() { return throwOnFail; } + CV_WRAP bool getExceptionMode() const { return throwOnFail; } /** @brief Wait for ready frames from VideoCapture. @@ -929,7 +1033,7 @@ class CV_EXPORTS_W VideoCapture After this call use VideoCapture::retrieve() to decode and fetch frame data. */ - static /*CV_WRAP*/ + CV_WRAP static bool waitAny( const std::vector& streams, CV_OUT std::vector& readyIndex, @@ -972,9 +1076,11 @@ class CV_EXPORTS_W VideoWriter /** @overload @param filename Name of the output video file. @param fourcc 4-character code of codec used to compress the frames. 
For example, - VideoWriter::fourcc('P','I','M','1') is a MPEG-1 codec, VideoWriter::fourcc('M','J','P','G') is a - motion-jpeg codec etc. List of codes can be obtained at [Video Codecs by - FOURCC](http://www.fourcc.org/codecs.php) page. FFMPEG backend with MP4 container natively uses + VideoWriter::fourcc('P','I','M','1') is a MPEG-1 codec, VideoWriter::fourcc('M','J','P','G') + is a motion-jpeg codec etc. List of codes can be obtained at + [MSDN](https://docs.microsoft.com/en-us/windows/win32/medfound/video-fourccs) page + or with this [page](https://fourcc.org/codecs.php) + of the fourcc site for a more complete list). FFMPEG backend with MP4 container natively uses other values as fourcc code: see [ObjectType](http://mp4ra.org/#/codecs), so you may receive a warning message from OpenCV about fourcc code conversion. @param fps Framerate of the created video stream. @@ -989,6 +1095,9 @@ class CV_EXPORTS_W VideoWriter - Most codecs are lossy. If you want lossless video file you need to use a lossless codecs (eg. FFMPEG FFV1, Huffman HFYU, Lagarith LAGS, etc...) - If FFMPEG is enabled, using `codec=0; fps=0;` you can create an uncompressed (raw) video file. + - If FFMPEG is used, we allow frames of odd width or height, but in this case we truncate + the rightmost column/the bottom row. Probably, this should be handled more elegantly, + but some internal functions inside FFMPEG swscale require even width/height. */ CV_WRAP VideoWriter(const String& filename, int fourcc, double fps, Size frameSize, bool isColor = true); diff --git a/3rdParty/opencv2/videoio/cap_ios.h b/3rdParty/opencv2/videoio/cap_ios.h deleted file mode 100644 index 4b84805275..0000000000 --- a/3rdParty/opencv2/videoio/cap_ios.h +++ /dev/null @@ -1,150 +0,0 @@ -/* For iOS video I/O - * by Eduard Feicho on 29/07/12 - * Copyright 2012. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO - * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#import -#import -#import -#import -#include "opencv2/core.hpp" - -//! @addtogroup videoio_ios -//! 
@{ - -/////////////////////////////////////// CvAbstractCamera ///////////////////////////////////// - -@class CvAbstractCamera; - -CV_EXPORTS @interface CvAbstractCamera : NSObject -{ - UIDeviceOrientation currentDeviceOrientation; - - BOOL cameraAvailable; -} - -@property (nonatomic, strong) AVCaptureSession* captureSession; -@property (nonatomic, strong) AVCaptureConnection* videoCaptureConnection; - -@property (nonatomic, readonly) BOOL running; -@property (nonatomic, readonly) BOOL captureSessionLoaded; - -@property (nonatomic, assign) int defaultFPS; -@property (nonatomic, readonly) AVCaptureVideoPreviewLayer *captureVideoPreviewLayer; -@property (nonatomic, assign) AVCaptureDevicePosition defaultAVCaptureDevicePosition; -@property (nonatomic, assign) AVCaptureVideoOrientation defaultAVCaptureVideoOrientation; -@property (nonatomic, assign) BOOL useAVCaptureVideoPreviewLayer; -@property (nonatomic, strong) NSString *const defaultAVCaptureSessionPreset; - -@property (nonatomic, assign) int imageWidth; -@property (nonatomic, assign) int imageHeight; - -@property (nonatomic, strong) UIView* parentView; - -- CV_UNUSED(start); -- CV_UNUSED(stop); -- CV_UNUSED(switchCameras); - -- (id)initWithParentView:(UIView*)parent; - -- CV_UNUSED(createCaptureOutput); -- CV_UNUSED(createVideoPreviewLayer); -- CV_UNUSED(updateOrientation); - -- CV_UNUSED(lockFocus); -- CV_UNUSED(unlockFocus); -- CV_UNUSED(lockExposure); -- CV_UNUSED(unlockExposure); -- CV_UNUSED(lockBalance); -- CV_UNUSED(unlockBalance); - -@end - -///////////////////////////////// CvVideoCamera /////////////////////////////////////////// - -@class CvVideoCamera; - -CV_EXPORTS @protocol CvVideoCameraDelegate - -#ifdef __cplusplus -// delegate method for processing image frames -- (void)processImage:(cv::Mat&)image; -#endif - -@end - -CV_EXPORTS @interface CvVideoCamera : CvAbstractCamera -{ - AVCaptureVideoDataOutput *videoDataOutput; - - dispatch_queue_t videoDataOutputQueue; - CALayer *customPreviewLayer; - - CMTime lastSampleTime; - -} - -@property (nonatomic, weak) id delegate; -@property (nonatomic, assign) BOOL grayscaleMode; - -@property (nonatomic, assign) BOOL recordVideo; -@property (nonatomic, assign) BOOL rotateVideo; -@property (nonatomic, strong) AVAssetWriterInput* recordAssetWriterInput; -@property (nonatomic, strong) AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor; -@property (nonatomic, strong) AVAssetWriter* recordAssetWriter; - -- (void)adjustLayoutToInterfaceOrientation:(UIInterfaceOrientation)interfaceOrientation; -- CV_UNUSED(layoutPreviewLayer); -- CV_UNUSED(saveVideo); -- (NSURL *)videoFileURL; -- (NSString *)videoFileString; - - -@end - -///////////////////////////////// CvPhotoCamera /////////////////////////////////////////// - -@class CvPhotoCamera; - -CV_EXPORTS @protocol CvPhotoCameraDelegate - -- (void)photoCamera:(CvPhotoCamera*)photoCamera capturedImage:(UIImage *)image; -- (void)photoCameraCancel:(CvPhotoCamera*)photoCamera; - -@end - -CV_EXPORTS @interface CvPhotoCamera : CvAbstractCamera -{ - AVCaptureStillImageOutput *stillImageOutput; -} - -@property (nonatomic, weak) id delegate; - -- CV_UNUSED(takePicture); - -@end - -//! 
@} videoio_ios diff --git a/3rdParty/opencv2/videoio/legacy/constants_c.h b/3rdParty/opencv2/videoio/legacy/constants_c.h index b78aaca065..191cfed7d9 100644 --- a/3rdParty/opencv2/videoio/legacy/constants_c.h +++ b/3rdParty/opencv2/videoio/legacy/constants_c.h @@ -5,6 +5,8 @@ #ifndef OPENCV_VIDEOIO_LEGACY_CONSTANTS_H #define OPENCV_VIDEOIO_LEGACY_CONSTANTS_H +#include "opencv2/core/cvdef.h" + enum { CV_CAP_ANY =0, // autodetect @@ -410,22 +412,6 @@ enum CV_CAP_PROP_VIEWFINDER = 17010 // Enter liveview mode. }; -//! Macro to construct the fourcc code of the codec. Same as CV_FOURCC() -#define CV_FOURCC_MACRO(c1, c2, c3, c4) (((c1) & 255) + (((c2) & 255) << 8) + (((c3) & 255) << 16) + (((c4) & 255) << 24)) - -/** @brief Constructs the fourcc code of the codec function - -Simply call it with 4 chars fourcc code like `CV_FOURCC('I', 'Y', 'U', 'V')` - -List of codes can be obtained at [Video Codecs by FOURCC](http://www.fourcc.org/codecs.php) page. -FFMPEG backend with MP4 container natively uses other values as fourcc code: -see [ObjectType](http://mp4ra.org/#/codecs). -*/ -CV_INLINE int CV_FOURCC(char c1, char c2, char c3, char c4) -{ - return CV_FOURCC_MACRO(c1, c2, c3, c4); -} - //! (Windows only) Open Codec Selection Dialog #define CV_FOURCC_PROMPT -1 //! (Linux only) Use default codec for specified filename diff --git a/3rdParty/opencv2/videoio/registry.hpp b/3rdParty/opencv2/videoio/registry.hpp index 1d5b3a1d03..a60d0e87bc 100644 --- a/3rdParty/opencv2/videoio/registry.hpp +++ b/3rdParty/opencv2/videoio/registry.hpp @@ -35,6 +35,9 @@ CV_EXPORTS_W std::vector<VideoCaptureAPIs> getCameraBackends(); /** @brief Returns list of available backends which works via `cv::VideoCapture(filename)` */ CV_EXPORTS_W std::vector<VideoCaptureAPIs> getStreamBackends(); +/** @brief Returns list of available backends which works via `cv::VideoCapture(buffer)` */ +CV_EXPORTS_W std::vector<VideoCaptureAPIs> getStreamBufferedBackends(); + /** @brief Returns list of available backends which works via `cv::VideoWriter()` */ CV_EXPORTS_W std::vector<VideoCaptureAPIs> getWriterBackends(); @@ -58,6 +61,13 @@ CV_EXPORTS_W std::string getStreamBackendPluginVersion( CV_OUT int& version_API ); +/** @brief Returns description and ABI/API version of videoio plugin's buffer capture interface */ +CV_EXPORTS_W std::string getStreamBufferedBackendPluginVersion( + VideoCaptureAPIs api, + CV_OUT int& version_ABI, + CV_OUT int& version_API +); + /** @brief Returns description and ABI/API version of videoio plugin's writer interface */ CV_EXPORTS_W std::string getWriterBackendPluginVersion( VideoCaptureAPIs api, diff --git a/3rdPartyBinaries/opencv_world460.dll b/3rdPartyBinaries/opencv_world4110.dll similarity index 79% rename from 3rdPartyBinaries/opencv_world460.dll rename to 3rdPartyBinaries/opencv_world4110.dll index bca44e4ecb..122c350f87 100644 Binary files a/3rdPartyBinaries/opencv_world460.dll and b/3rdPartyBinaries/opencv_world4110.dll differ diff --git a/3rdPartyBinaries/opencv_world4110.lib b/3rdPartyBinaries/opencv_world4110.lib new file mode 100644 index 0000000000..468311ef53 Binary files /dev/null and b/3rdPartyBinaries/opencv_world4110.lib differ diff --git a/3rdPartyBinaries/opencv_world4110d.lib b/3rdPartyBinaries/opencv_world4110d.lib new file mode 100644 index 0000000000..bcb3c4ae6f Binary files /dev/null and b/3rdPartyBinaries/opencv_world4110d.lib differ diff --git a/3rdPartyBinaries/opencv_world460d.zip b/3rdPartyBinaries/opencv_world4110d.zip similarity index 71% rename from 3rdPartyBinaries/opencv_world460d.zip rename to 3rdPartyBinaries/opencv_world4110d.zip index
092cf2b076..a2df06ec3a 100644 Binary files a/3rdPartyBinaries/opencv_world460d.zip and b/3rdPartyBinaries/opencv_world4110d.zip differ diff --git a/3rdPartyBinaries/opencv_world460.lib b/3rdPartyBinaries/opencv_world460.lib deleted file mode 100644 index 13cf98ea70..0000000000 Binary files a/3rdPartyBinaries/opencv_world460.lib and /dev/null differ diff --git a/3rdPartyBinaries/opencv_world460d.lib b/3rdPartyBinaries/opencv_world460d.lib deleted file mode 100644 index cd2f425523..0000000000 Binary files a/3rdPartyBinaries/opencv_world460d.lib and /dev/null differ diff --git a/SerialPrograms/CMakeLists.txt b/SerialPrograms/CMakeLists.txt index f5ce9d7733..1dbf4590d6 100644 --- a/SerialPrograms/CMakeLists.txt +++ b/SerialPrograms/CMakeLists.txt @@ -2297,10 +2297,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../../Internal/SerialPrograms/Internal0. target_sources(SerialPrograms PRIVATE ../../Internal/SerialPrograms/Internal1.cpp) endif() -#extract opencv_world460d.dll from archive on Windows Debug builds +#extract opencv_world4110d.dll from archive on Windows Debug builds if (MSVC) file(ARCHIVE_EXTRACT - INPUT ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world460d.zip + INPUT ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world4110d.zip DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/ ) endif() @@ -2316,8 +2316,8 @@ target_link_directories(SerialPrograms PRIVATE ../3rdPartyBinaries/) if (MSVC) add_library(OpenCV_lib IMPORTED UNKNOWN) set_target_properties(OpenCV_lib PROPERTIES - IMPORTED_LOCATION_RELEASE ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world460.lib - IMPORTED_LOCATION_DEBUG ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world460d.lib) + IMPORTED_LOCATION_RELEASE ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world4110.lib + IMPORTED_LOCATION_DEBUG ${CMAKE_CURRENT_SOURCE_DIR}/../3rdPartyBinaries/opencv_world4110d.lib) set_target_properties(OpenCV_lib PROPERTIES MAP_IMPORTED_CONFIG_DEBUG DEBUG MAP_IMPORTED_CONFIG_RELEASE RELEASE diff --git a/SerialPrograms/SerialPrograms.pro b/SerialPrograms/SerialPrograms.pro index 6feea7fe4e..e717e3bd86 100644 --- a/SerialPrograms/SerialPrograms.pro +++ b/SerialPrograms/SerialPrograms.pro @@ -59,7 +59,7 @@ win32-msvc{ DEFINES += PA_DPP LIBS += ../3rdPartyBinaries/dpp.lib - LIBS += ../3rdPartyBinaries/opencv_world460.lib + LIBS += ../3rdPartyBinaries/opencv_world4110.lib } macx{ QMAKE_CXXFLAGS += -std=c++14
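
Editor's note on the buffered-capture API this header update pulls in: the IStreamReader interface and the VideoCapture(const Ptr<IStreamReader>&, int apiPreference, const std::vector<int>& params) constructor/open overloads shown in the videoio.hpp hunks above let OpenCV 4.11 decode video straight out of memory instead of a file or camera, and getStreamBufferedBackends() / getStreamBufferedBackendPluginVersion() in registry.hpp report which backends can service such a source. The sketch below is purely illustrative and is not part of this diff: the BufferReader class name is invented here, and the choice of cv::CAP_FFMPEG as the backend is an assumption.

// Illustrative sketch (not part of this diff): a memory-backed cv::IStreamReader.
#include <opencv2/videoio.hpp>
#include <algorithm>
#include <cstdio>    // SEEK_SET / SEEK_CUR / SEEK_END
#include <cstring>
#include <vector>

class BufferReader : public cv::IStreamReader  // hypothetical helper; the name is ours
{
public:
    explicit BufferReader(std::vector<char> data) : data_(std::move(data)) {}

    long long read(char* buffer, long long size) override
    {
        // Copy at most `size` bytes from the current position and advance it.
        const long long avail = static_cast<long long>(data_.size()) - pos_;
        const long long n = std::max(0LL, std::min(size, avail));
        if (n > 0)
            std::memcpy(buffer, data_.data() + pos_, static_cast<size_t>(n));
        pos_ += n;
        return n;
    }

    long long seek(long long offset, int origin) override
    {
        // Mirror fseek semantics, as the interface documentation above suggests.
        if (origin == SEEK_SET)      pos_ = offset;
        else if (origin == SEEK_CUR) pos_ += offset;
        else                         pos_ = static_cast<long long>(data_.size()) + offset; // SEEK_END
        return pos_;
    }

private:
    std::vector<char> data_;
    long long pos_ = 0;
};

// Usage sketch (backend choice is an assumption; FFMPEG is the usual stream-capable backend):
//   cv::VideoCapture cap(cv::makePtr<BufferReader>(std::move(bytes)), cv::CAP_FFMPEG, {});
//   cv::Mat frame;
//   while (cap.read(frame)) { /* process frame */ }

Nothing in SerialPrograms is required to use this overload; the practical effect of the upgrade is simply that opencv_world4110 now exports these symbols.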
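A second, smaller consequence of the constants_c.h hunk above: the legacy CV_FOURCC() inline helper and the CV_FOURCC_MACRO define are removed from this vendored header, so any remaining caller would have to switch to the C++ equivalent, cv::VideoWriter::fourcc(), which the VideoWriter documentation above already uses. A minimal example of that replacement follows; the file name, codec, frame rate, and frame size are arbitrary choices, not taken from the repository.

// Replacement for the removed CV_FOURCC() helper: cv::VideoWriter::fourcc().
#include <opencv2/videoio.hpp>

int main()
{
    const int fourcc = cv::VideoWriter::fourcc('M', 'J', 'P', 'G');            // motion-JPEG
    cv::VideoWriter writer("out.avi", fourcc, 30.0, cv::Size(640, 480), true); // isColor = true
    return writer.isOpened() ? 0 : 1;  // 0 if the backend accepted the codec/container pair
}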