// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level
// directory of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_VSX_UTILS_HPP
#define OPENCV_HAL_VSX_UTILS_HPP

#include "opencv2/core/cvdef.h"

#ifndef SKIP_INCLUDES
#include <assert.h>
#endif

//! @addtogroup core_utils_vsx
//! @{
#if CV_VSX

#define __VSX_S16__(c, v)                                                      \
  (c) { v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v }
#define __VSX_S8__(c, v)                                                       \
  (c) { v, v, v, v, v, v, v, v }
#define __VSX_S4__(c, v)                                                       \
  (c) { v, v, v, v }
#define __VSX_S2__(c, v)                                                       \
  (c) { v, v }

typedef __vector unsigned char vec_uchar16;
#define vec_uchar16_set(...)                                                   \
  (vec_uchar16) { __VA_ARGS__ }
#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, (unsigned char)c))
#define vec_uchar16_c(v) ((vec_uchar16)(v))
#define vec_uchar16_z vec_uchar16_sp(0)

typedef __vector signed char vec_char16;
#define vec_char16_set(...)                                                    \
  (vec_char16) { __VA_ARGS__ }
#define vec_char16_sp(c) (__VSX_S16__(vec_char16, (signed char)c))
#define vec_char16_c(v) ((vec_char16)(v))
#define vec_char16_z vec_char16_sp(0)

typedef __vector unsigned short vec_ushort8;
#define vec_ushort8_set(...)                                                   \
  (vec_ushort8) { __VA_ARGS__ }
#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, (unsigned short)c))
#define vec_ushort8_c(v) ((vec_ushort8)(v))
#define vec_ushort8_z vec_ushort8_sp(0)

typedef __vector signed short vec_short8;
#define vec_short8_set(...)                                                    \
  (vec_short8) { __VA_ARGS__ }
#define vec_short8_sp(c) (__VSX_S8__(vec_short8, (signed short)c))
#define vec_short8_c(v) ((vec_short8)(v))
#define vec_short8_z vec_short8_sp(0)

typedef __vector unsigned int vec_uint4;
#define vec_uint4_set(...)                                                     \
  (vec_uint4) { __VA_ARGS__ }
#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, (unsigned int)c))
#define vec_uint4_c(v) ((vec_uint4)(v))
#define vec_uint4_z vec_uint4_sp(0)

typedef __vector signed int vec_int4;
#define vec_int4_set(...)                                                      \
  (vec_int4) { __VA_ARGS__ }
#define vec_int4_sp(c) (__VSX_S4__(vec_int4, (signed int)c))
#define vec_int4_c(v) ((vec_int4)(v))
#define vec_int4_z vec_int4_sp(0)

typedef __vector float vec_float4;
#define vec_float4_set(...)                                                    \
  (vec_float4) { __VA_ARGS__ }
#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
#define vec_float4_c(v) ((vec_float4)(v))
#define vec_float4_z vec_float4_sp(0)

typedef __vector unsigned long long vec_udword2;
#define vec_udword2_set(...)                                                   \
  (vec_udword2) { __VA_ARGS__ }
#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, (unsigned long long)c))
#define vec_udword2_c(v) ((vec_udword2)(v))
#define vec_udword2_z vec_udword2_sp(0)

typedef __vector signed long long vec_dword2;
#define vec_dword2_set(...)                                                    \
  (vec_dword2) { __VA_ARGS__ }
#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, (signed long long)c))
#define vec_dword2_c(v) ((vec_dword2)(v))
#define vec_dword2_z vec_dword2_sp(0)

typedef __vector double vec_double2;
#define vec_double2_set(...)                                                   \
  (vec_double2) { __VA_ARGS__ }
#define vec_double2_c(v) ((vec_double2)(v))
#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
#define vec_double2_z vec_double2_sp(0)

#define vec_bchar16 __vector __bool char
#define vec_bchar16_set(...)                                                   \
  (vec_bchar16) { __VA_ARGS__ }
#define vec_bchar16_c(v) ((vec_bchar16)(v))

#define vec_bshort8 __vector __bool short
#define vec_bshort8_set(...)                                                   \
  (vec_bshort8) { __VA_ARGS__ }
#define vec_bshort8_c(v) ((vec_bshort8)(v))

#define vec_bint4 __vector __bool int
#define vec_bint4_set(...)                                                     \
  (vec_bint4) { __VA_ARGS__ }
#define vec_bint4_c(v) ((vec_bint4)(v))

#define vec_bdword2 __vector __bool long long
#define vec_bdword2_set(...)                                                   \
  (vec_bdword2) { __VA_ARGS__ }
#define vec_bdword2_c(v) ((vec_bdword2)(v))

#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2)                                     \
  VSX_FINLINE(rt) fnm(const rg &a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2)                                     \
  VSX_FINLINE(rt) fnm(const rg &a, const rg &b) { return fn2(a, b); }

/*
 * GCC VSX compatibility
 **/
#if defined(__GNUG__) && !defined(__clang__)

// inline asm helper
#define VSX_IMPL_1RG(rt, rg, opc, fnm)                                         \
  VSX_FINLINE(rt) fnm(const rg &a) {                                           \
    rt rs;                                                                     \
    __asm__ __volatile__(#opc " %x0,%x1" : "=wa"(rs) : "wa"(a));               \
    return rs;                                                                 \
  }

#define VSX_IMPL_1VRG(rt, rg, opc, fnm)                                        \
  VSX_FINLINE(rt) fnm(const rg &a) {                                           \
    rt rs;                                                                     \
    __asm__ __volatile__(#opc " %0,%1" : "=v"(rs) : "v"(a));                   \
    return rs;                                                                 \
  }

#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)                                     \
  VSX_FINLINE(rt) fnm(const rg &a, const rg &b) {                              \
    rt rs;                                                                     \
    __asm__ __volatile__(fopc : "=v"(rs) : "v"(a), "v"(b));                    \
    return rs;                                                                 \
  }

#define VSX_IMPL_2VRG(rt, rg, opc, fnm)                                        \
  VSX_IMPL_2VRG_F(rt, rg, #opc " %0,%1,%2", fnm)

#if __GNUG__ < 8

// Support for int4 -> dword2 expanding multiply was added in GCC 8.
#ifdef vec_mule
#undef vec_mule
#endif
#ifdef vec_mulo
#undef vec_mulo
#endif

VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)

// dword2 support arrived in ISA 2.07 and GCC 8+
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)

#endif

#if __GNUG__ < 7
// up to GCC 6 vec_mul only supports precisions and llong
#ifdef vec_mul
#undef vec_mul
#endif
/*
 * there's no a direct instruction for supporting 8-bit, 16-bit multiplication
 *in ISA 2.07, XLC Implement it by using instruction "multiply even", "multiply
 *odd" and "permute"
 **/
#define VSX_IMPL_MULH(Tvec, cperm)                                             \
  VSX_FINLINE(Tvec) vec_mul(const Tvec &a, const Tvec &b) {                    \
    static const vec_uchar16 ev_od = {cperm};                                  \
    return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od);        \
  }
#define VSX_IMPL_MULH_P16                                                      \
  0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
#define VSX_IMPL_MULH_P8                                                       \
  0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// redirect to GCC builtin vec_mul, since it already supports precisions and
// llong
VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
#endif // __GNUG__ < 7

#if __GNUG__ < 6
/*
 * Instruction "compare greater than or equal" in ISA 2.07 only supports single
 * and double precision.
 * In XLC and new versions of GCC implement integers by using instruction
 *"greater than" and NOR.
 **/
#ifdef vec_cmpge
#undef vec_cmpge
#endif
#ifdef vec_cmple
#undef vec_cmple
#endif
#define vec_cmple(a, b) vec_cmpge(b, a)
#define VSX_IMPL_CMPGE(rt, rg, opc, fnm)                                       \
  VSX_IMPL_2VRG_F(rt, rg, #opc " %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)

VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)

// redirect to GCC builtin cmpge, since it already supports precisions
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)

// up to gcc5 vec_nor doesn't support bool long long
#undef vec_nor
template <typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)

VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2 &a, const vec_bdword2 &b) {
  return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b)));
}

// vec_packs doesn't support double words in gcc4 and old versions of gcc5
#undef vec_packs
VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)

VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#endif // __GNUG__ < 6

#if __GNUG__ < 5
// vec_xxpermdi in gcc4 missing little-endian supports just like clang
#define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
// same as vec_xxpermdi
#undef vec_vbpermq
VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
VSX_IMPL_2VRG(vec_dword2, vec_char16, vbpermq, vec_vbpermq)
#else
#define vec_permi vec_xxpermdi
#endif // __GNUG__ < 5

// shift left double by word immediate
#ifndef vec_sldw
#define vec_sldw __builtin_vsx_xxsldwi
#endif

// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)

// converts between single and double-precision
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

// converts word and doubleword to double-precision
#undef vec_ctd
VSX_IMPL_1RG(vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)

// converts word and doubleword to single-precision
#undef vec_ctf
VSX_IMPL_1RG(vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)

// converts single and double precision to signed word
#undef vec_cts
VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)

// converts single and double precision to unsigned word
#undef vec_ctu
VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)

// converts single and double precision to signed doubleword
#undef vec_ctsl
VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)

// converts single and double precision to unsigned doubleword
#undef vec_ctul
VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)

// just in case if GCC doesn't define it
#ifndef vec_xl
#define vec_xl vec_vsx_ld
#define vec_xst vec_vsx_st
#endif

#endif // GCC VSX compatibility

/*
 * CLANG VSX compatibility
 **/
#if defined(__clang__) && !defined(__IBMCPP__)

/*
 * CLANG doesn't support %x<n> in the inline asm template which fixes register
 * number when using any of the register constraints wa, wd, wf
 *
 * For more explanation checkout PowerPC and IBM RS6000 in
 * https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html Also there's
 * already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
 *
 * So we're not able to use inline asm and only use built-in functions that
 * CLANG supports and use __builtin_convertvector if clang missing any of vector
 * conversions built-in functions
 *
 * todo: clang asm template bug is fixed, need to reconsider the current
 * workarounds.
 */

// convert vector helper
#define VSX_IMPL_CONVERT(rt, rg, fnm)                                          \
  VSX_FINLINE(rt) fnm(const rg &a) { return __builtin_convertvector(a, rt); }

#ifndef vec_permi
#if __clang_major__ < 5
// implement vec_permi in a dirty way
#define VSX_IMPL_CLANG_4_PERMI(Tvec)                                           \
  VSX_FINLINE(Tvec)                                                            \
  vec_permi(const Tvec &a, const Tvec &b, unsigned const char c) {             \
    switch (c) {                                                               \
    case 0:                                                                    \
      return vec_mergeh(a, b);                                                 \
    case 1:                                                                    \
      return vec_mergel(vec_mergeh(a, a), b);                                  \
    case 2:                                                                    \
      return vec_mergeh(vec_mergel(a, a), b);                                  \
    default:                                                                   \
      return vec_mergel(a, b);                                                 \
    }                                                                          \
  }
VSX_IMPL_CLANG_4_PERMI(vec_udword2)
VSX_IMPL_CLANG_4_PERMI(vec_dword2)
VSX_IMPL_CLANG_4_PERMI(vec_double2)

// vec_xxsldwi is missing in clang 4
#define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
#else
// vec_xxpermdi is missing little-endian supports in clang 4 just like gcc4
#define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
#endif // __clang_major__ < 5
#endif

// shift left double by word immediate
#ifndef vec_sldw
#define vec_sldw vec_xxsldwi
#endif

#if __clang_major__ < 13
// Implement vec_rsqrt since clang only supports vec_rsqrte
#ifndef vec_rsqrt
VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4 &a) {
  return vec_div(vec_float4_sp(1), vec_sqrt(a));
}

VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2 &a) {
  return vec_div(vec_double2_sp(1), vec_sqrt(a));
}
#endif

// vec_promote missing support for doubleword
VSX_FINLINE(vec_dword2) vec_promote(long long a, int b) {
  vec_dword2 ret = vec_dword2_z;
  ret[b & 1] = a;
  return ret;
}

VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b) {
  vec_udword2 ret = vec_udword2_z;
  ret[b & 1] = a;
  return ret;
}
#endif

// vec_popcnt should return unsigned but clang has different thought just like
// gcc in vec_vpopcnt
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast)                                   \
  VSX_FINLINE(Tvec) vec_popcntu(const Tvec2 &a) { return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)

// converts between single and double precision
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

// converts word and doubleword to double-precision
#ifdef vec_ctd
#undef vec_ctd
#endif
VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)

VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)

// converts word and doubleword to single-precision
#if __clang_major__ > 4
#undef vec_ctf
#endif
VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctfo, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)

// converts single and double precision to signed word
#if __clang_major__ > 4
#undef vec_cts
#endif
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)

// converts single and double precision to unsigned word
#if __clang_major__ > 4
#undef vec_ctu
#endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)

// converts single and double precision to signed doubleword
#ifdef vec_ctsl
#undef vec_ctsl
#endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector unable to convert, xvcvspsxds is missing on it
VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4 &a) {
  return vec_ctsl(vec_cvfo(a));
}

// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
#undef vec_ctul
#endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector unable to convert, xvcvspuxds is missing on it
VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4 &a) {
  return vec_ctul(vec_cvfo(a));
}

#endif // CLANG VSX compatibility

/*
 * Common GCC, CLANG compatibility
 **/
#if defined(__GNUG__) && !defined(__IBMCPP__)

#ifdef vec_cvf
#undef vec_cvf
#endif

#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2)                               \
  VSX_FINLINE(rt) fnm(const rg &a) { return fn2(vec_sldw(a, a, 1)); }

VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4, vec_ctd, vec_ctdo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4, vec_ctd, vec_ctdo)

VSX_IMPL_CONV_EVEN_4_2(vec_dword2, vec_float4, vec_ctsl, vec_ctslo)
VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)

#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2)                               \
  VSX_FINLINE(rt) fnm(const rg &a) {                                           \
    rt v4 = fn2(a);                                                            \
    return vec_sldw(v4, v4, 3);                                                \
  }

VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2, vec_ctf, vec_ctfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)

VSX_IMPL_CONV_EVEN_2_4(vec_int4, vec_double2, vec_cts, vec_ctso)
VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)

// Only for Eigen!
/*
 * changing behavior of conversion intrinsics for gcc has effect on Eigen
 * so we redefine old behavior again only on gcc, clang
 */
#if !defined(__clang__) || __clang_major__ > 4
// ignoring second arg since Eigen only truncates toward zero
#define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2)                               \
  VSX_FINLINE(rt) fnm(const rg &a, int only_truncate) {                        \
    assert(only_truncate == 0);                                                \
    CV_UNUSED(only_truncate);                                                  \
    return fn2(a);                                                             \
  }
VSX_IMPL_CONV_2VARIANT(vec_int4, vec_float4, vec_cts, vec_cts)
VSX_IMPL_CONV_2VARIANT(vec_uint4, vec_float4, vec_ctu, vec_ctu)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4, vec_ctf, vec_ctf)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4, vec_ctf, vec_ctf)
// define vec_cts for converting double precision to signed doubleword
// which isn't compatible with xlc but its okay since Eigen only uses it for gcc
VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
#endif // Eigen

#endif // Common GCC, CLANG compatibility

/*
 * XLC VSX compatibility
 **/
#if defined(__IBMCPP__)

// vector population count
#define vec_popcntu vec_popcnt

// overload and redirect with setting second arg to zero
// since we only support conversions without the second arg
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm)                                      \
  VSX_FINLINE(rt) fnm(const rg &a) { return fnm(a, 0); }

VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)

VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)

VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)

VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)

VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)

VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)

// fixme: implement conversions of odd-numbered elements in a dirty way
// since xlc doesn't support VSX registers operand in inline asm.
#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2)                                \
  VSX_FINLINE(rt) fnm(const rg &a) { return fn2(vec_sldw(a, a, 3)); }

VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4, vec_ctdo, vec_ctd)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4, vec_ctdo, vec_ctd)

VSX_IMPL_CONV_ODD_4_2(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)

#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2)                                \
  VSX_FINLINE(rt) fnm(const rg &a) {                                           \
    rt v4 = fn2(a);                                                            \
    return vec_sldw(v4, v4, 1);                                                \
  }

VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2, vec_ctfo, vec_ctf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)

VSX_IMPL_CONV_ODD_2_4(vec_int4, vec_double2, vec_ctso, vec_cts)
VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)

#endif // XLC VSX compatibility

// ignore GCC warning that caused by -Wunused-but-set-variable in rare cases
#if defined(__GNUG__) && !defined(__clang__)
#define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else // CLANG, XLC
#define VSX_UNUSED(Tvec) Tvec
#endif

// gcc can find his way in casting log int and XLC, CLANG ambiguous
#if defined(__clang__) || defined(__IBMCPP__)
VSX_FINLINE(vec_udword2) vec_splats(uint64 v) {
  return vec_splats((unsigned long long)v);
}

VSX_FINLINE(vec_dword2) vec_splats(int64 v) { return vec_splats((long long)v); }

VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b) {
  return vec_promote((unsigned long long)a, b);
}

VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b) {
  return vec_promote((long long)a, b);
}
#endif

/*
 * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
 * load and set using offset depend on the pointer type
 *
 * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer)
 * load and set using offset depend on fixed bytes size
 *
 * Note: In clang vec_xl and vec_xst fails to load unaligned addresses
 * so we are using vec_vsx_ld, vec_vsx_st instead
 */

#if defined(__clang__) && !defined(__IBMCPP__)
#define vsx_ldf vec_vsx_ld
#define vsx_stf vec_vsx_st
#else // GCC , XLC
#define vsx_ldf vec_xl
#define vsx_stf vec_xst
#endif

#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)

/*
 * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load
 * and store double words In GCC vec_xl and vec_xst it maps to vec_vsx_ld,
 * vec_vsx_st which doesn't support long long and in CLANG we are using
 * vec_vsx_ld, vec_vsx_st because vec_xl, vec_xst fails to load unaligned
 * addresses
 *
 * In XLC vec_xl and vec_xst fail to cast int64(long int) to long long
 */
#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64 *p) {
  return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int *)p));
}

VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64 *p) {
  return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int *)p));
}

VSX_FINLINE(void) vsx_st2(const vec_udword2 &vec, long o, uint64 *p) {
  vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int *)p);
}

VSX_FINLINE(void) vsx_st2(const vec_dword2 &vec, long o, int64 *p) {
  vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int *)p);
}
#else // XLC
VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64 *p) {
  return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long *)p);
}

VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64 *p) {
  return vsx_ldf(VSX_OFFSET(o, p), (long long *)p);
}

VSX_FINLINE(void) vsx_st2(const vec_udword2 &vec, long o, uint64 *p) {
  vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long *)p);
}

VSX_FINLINE(void) vsx_st2(const vec_dword2 &vec, long o, int64 *p) {
  vsx_stf(vec, VSX_OFFSET(o, p), (long long *)p);
}
#endif

// Store lower 8 byte
#define vec_st_l8(v, p) *((uint64 *)(p)) = vec_extract(vec_udword2_c(v), 0)

// Store higher 8 byte
#define vec_st_h8(v, p) *((uint64 *)(p)) = vec_extract(vec_udword2_c(v), 1)

// Load 64-bits of integer data to lower part
#define VSX_IMPL_LOAD_L8(Tvec, Tp)                                             \
  VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) {                                   \
    return ((Tvec)vec_promote(*((uint64 *)p), 0));                             \
  }

VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16, schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8, short)
VSX_IMPL_LOAD_L8(vec_uint4, uint)
VSX_IMPL_LOAD_L8(vec_int4, int)
VSX_IMPL_LOAD_L8(vec_float4, float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2, int64)
VSX_IMPL_LOAD_L8(vec_double2, double)

// logical not
#define vec_not(a) vec_nor(a, a)

// power9 yaya
// not equal
#ifndef vec_cmpne
#define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
#endif

// absolute difference
#ifndef _ARCH_PWR9
#undef vec_absd
#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
#endif

/*
 * Implement vec_unpacklu and vec_unpackhu
 * since vec_unpackl, vec_unpackh only support signed integers
 **/
#define VSX_IMPL_UNPACKU(rt, rg, zero)                                         \
  VSX_FINLINE(rt) vec_unpacklu(const rg &a) {                                  \
    return (rt)(vec_mergel(a, zero));                                          \
  }                                                                            \
  VSX_FINLINE(rt) vec_unpackhu(const rg &a) {                                  \
    return (rt)(vec_mergeh(a, zero));                                          \
  }

VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)

/*
 * Implement vec_mergesqe and vec_mergesqo
 * Merges the sequence values of even and odd elements of two vectors
 */
#define VSX_IMPL_PERM(rt, fnm, ...)                                            \
  VSX_FINLINE(rt) fnm(const rt &a, const rt &b) {                              \
    static const vec_uchar16 perm = {__VA_ARGS__};                             \
    return vec_perm(a, b, perm);                                               \
  }

// 16
#define perm16_mergesqe                                                        \
  0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo                                                        \
  1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
// 8
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo                                                         \
  2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
// 4
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo                                                         \
  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)

/*
 * Implement vec_mergesqh and vec_mergesql
 * Merges the sequence most and least significant halves of two vectors
 */
#define VSX_IMPL_MERGESQHL(Tvec)                                               \
  VSX_FINLINE(Tvec) vec_mergesqh(const Tvec &a, const Tvec &b) {               \
    return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b));               \
  }                                                                            \
  VSX_FINLINE(Tvec) vec_mergesql(const Tvec &a, const Tvec &b) {               \
    return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b));               \
  }
VSX_IMPL_MERGESQHL(vec_uchar16)
VSX_IMPL_MERGESQHL(vec_char16)
VSX_IMPL_MERGESQHL(vec_ushort8)
VSX_IMPL_MERGESQHL(vec_short8)
VSX_IMPL_MERGESQHL(vec_uint4)
VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)

// 2 and 4 channels interleave for all types except 2 lanes
#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                       \
  VSX_FINLINE(void) vec_st_interleave(const Tvec &a, const Tvec &b, Tp *ptr) { \
    vsx_stf(vec_mergeh(a, b), 0, ptr);                                         \
    vsx_stf(vec_mergel(a, b), 16, ptr);                                        \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c,               \
                    const Tvec &d, Tp *ptr) {                                  \
    Tvec ac = vec_mergeh(a, c);                                                \
    Tvec bd = vec_mergeh(b, d);                                                \
    vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                       \
    vsx_stf(vec_mergel(ac, bd), 16, ptr);                                      \
    ac = vec_mergel(a, c);                                                     \
    bd = vec_mergel(b, d);                                                     \
    vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                      \
    vsx_stf(vec_mergel(ac, bd), 48, ptr);                                      \
  }
VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE(float, vec_float4)

// 2 and 4 channels deinterleave for 16 lanes
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                    \
  VSX_FINLINE(void) vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b) {     \
    Tvec v0 = vsx_ld(0, ptr);                                                  \
    Tvec v1 = vsx_ld(16, ptr);                                                 \
    a = vec_mergesqe(v0, v1);                                                  \
    b = vec_mergesqo(v0, v1);                                                  \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c, Tvec &d) {     \
    Tvec v0 = vsx_ld(0, ptr);                                                  \
    Tvec v1 = vsx_ld(16, ptr);                                                 \
    Tvec v2 = vsx_ld(32, ptr);                                                 \
    Tvec v3 = vsx_ld(48, ptr);                                                 \
    Tvec m0 = vec_mergesqe(v0, v1);                                            \
    Tvec m1 = vec_mergesqe(v2, v3);                                            \
    a = vec_mergesqe(m0, m1);                                                  \
    c = vec_mergesqo(m0, m1);                                                  \
    m0 = vec_mergesqo(v0, v1);                                                 \
    m1 = vec_mergesqo(v2, v3);                                                 \
    b = vec_mergesqe(m0, m1);                                                  \
    d = vec_mergesqo(m0, m1);                                                  \
  }
VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)

// 2 and 4 channels deinterleave for 8 lanes
#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                   \
  VSX_FINLINE(void) vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b) {     \
    Tvec v0 = vsx_ld(0, ptr);                                                  \
    Tvec v1 = vsx_ld(8, ptr);                                                  \
    a = vec_mergesqe(v0, v1);                                                  \
    b = vec_mergesqo(v0, v1);                                                  \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c, Tvec &d) {     \
    Tvec v0 = vsx_ld(0, ptr);                                                  \
    Tvec v1 = vsx_ld(8, ptr);                                                  \
    Tvec m0 = vec_mergeh(v0, v1);                                              \
    Tvec m1 = vec_mergel(v0, v1);                                              \
    Tvec ab0 = vec_mergeh(m0, m1);                                             \
    Tvec cd0 = vec_mergel(m0, m1);                                             \
    v0 = vsx_ld(16, ptr);                                                      \
    v1 = vsx_ld(24, ptr);                                                      \
    m0 = vec_mergeh(v0, v1);                                                   \
    m1 = vec_mergel(v0, v1);                                                   \
    Tvec ab1 = vec_mergeh(m0, m1);                                             \
    Tvec cd1 = vec_mergel(m0, m1);                                             \
    a = vec_mergesqh(ab0, ab1);                                                \
    b = vec_mergesql(ab0, ab1);                                                \
    c = vec_mergesqh(cd0, cd1);                                                \
    d = vec_mergesql(cd0, cd1);                                                \
  }
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)

// 2 and 4 channels deinterleave for 4 lanes
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                   \
  VSX_FINLINE(void) vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b) {     \
    a = vsx_ld(0, ptr);                                                        \
    b = vsx_ld(4, ptr);                                                        \
    Tvec m0 = vec_mergeh(a, b);                                                \
    Tvec m1 = vec_mergel(a, b);                                                \
    a = vec_mergeh(m0, m1);                                                    \
    b = vec_mergel(m0, m1);                                                    \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c, Tvec &d) {     \
    Tvec v0 = vsx_ld(0, ptr);                                                  \
    Tvec v1 = vsx_ld(4, ptr);                                                  \
    Tvec v2 = vsx_ld(8, ptr);                                                  \
    Tvec v3 = vsx_ld(12, ptr);                                                 \
    Tvec m0 = vec_mergeh(v0, v2);                                              \
    Tvec m1 = vec_mergeh(v1, v3);                                              \
    a = vec_mergeh(m0, m1);                                                    \
    b = vec_mergel(m0, m1);                                                    \
    m0 = vec_mergel(v0, v2);                                                   \
    m1 = vec_mergel(v1, v3);                                                   \
    c = vec_mergeh(m0, m1);                                                    \
    d = vec_mergel(m0, m1);                                                    \
  }
VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)

// 2 and 4 channels interleave and deinterleave for 2 lanes
#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)                \
  VSX_FINLINE(void) vec_st_interleave(const Tvec &a, const Tvec &b, Tp *ptr) { \
    st_func(vec_mergeh(a, b), 0, ptr);                                         \
    st_func(vec_mergel(a, b), 2, ptr);                                         \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c,               \
                    const Tvec &d, Tp *ptr) {                                  \
    st_func(vec_mergeh(a, b), 0, ptr);                                         \
    st_func(vec_mergeh(c, d), 2, ptr);                                         \
    st_func(vec_mergel(a, b), 4, ptr);                                         \
    st_func(vec_mergel(c, d), 6, ptr);                                         \
  }                                                                            \
  VSX_FINLINE(void) vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b) {     \
    Tvec m0 = ld_func(0, ptr);                                                 \
    Tvec m1 = ld_func(2, ptr);                                                 \
    a = vec_mergeh(m0, m1);                                                    \
    b = vec_mergel(m0, m1);                                                    \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c, Tvec &d) {     \
    Tvec v0 = ld_func(0, ptr);                                                 \
    Tvec v1 = ld_func(2, ptr);                                                 \
    Tvec v2 = ld_func(4, ptr);                                                 \
    Tvec v3 = ld_func(6, ptr);                                                 \
    a = vec_mergeh(v0, v2);                                                    \
    b = vec_mergel(v0, v2);                                                    \
    c = vec_mergeh(v1, v3);                                                    \
    d = vec_mergel(v1, v3);                                                    \
  }
VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)

/* 3 channels */
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c, Tp *ptr) {    \
    static const vec_uchar16 a12 = {0, 16, 0,  1, 17, 0,  2, 18,               \
                                    0, 3,  19, 0, 4,  20, 0, 5};               \
    static const vec_uchar16 a123 = {0,  1, 16, 3,  4,  17, 6,  7,             \
                                     18, 9, 10, 19, 12, 13, 20, 15};           \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                    \
    static const vec_uchar16 b12 = {21, 0,  6, 22, 0,  7, 23, 0,               \
                                    8,  24, 0, 9,  25, 0, 10, 26};             \
    static const vec_uchar16 b123 = {0, 21, 2,  3,  22, 5,  6,  23,            \
                                     8, 9,  24, 11, 12, 25, 14, 15};           \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                   \
    static const vec_uchar16 c12 = {0,  11, 27, 0,  12, 28, 0,  13,            \
                                    29, 0,  14, 30, 0,  15, 31, 0};            \
    static const vec_uchar16 c123 = {26, 1,  2,  27, 4,  5,  28, 7,            \
                                     8,  29, 10, 11, 30, 13, 14, 31};          \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                   \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c) {              \
    Tvec v1 = vsx_ld(0, ptr);                                                  \
    Tvec v2 = vsx_ld(16, ptr);                                                 \
    Tvec v3 = vsx_ld(32, ptr);                                                 \
    static const vec_uchar16 a12_perm = {0,  3,  6,  9, 12, 15, 18, 21,        \
                                         24, 27, 30, 0, 0,  0,  0,  0};        \
    static const vec_uchar16 a123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 10, 17, 20, 23, 26, 29};       \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                   \
    static const vec_uchar16 b12_perm = {1,  4,  7,  10, 13, 16, 19, 22,       \
                                         25, 28, 31, 0,  0,  0,  0,  0};       \
    static const vec_uchar16 b123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 10, 18, 21, 24, 27, 30};       \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                   \
    static const vec_uchar16 c12_perm = {2,  5,  8, 11, 14, 17, 20, 23,        \
                                         26, 29, 0, 0,  0,  0,  0,  0};        \
    static const vec_uchar16 c123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 16, 19, 22, 25, 28, 31};       \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                   \
  }
VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)

#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                 \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c, Tp *ptr) {    \
    static const vec_uchar16 a12 = {0,  1,  16, 17, 0, 0, 2,  3,               \
                                    18, 19, 0,  0,  4, 5, 20, 21};             \
    static const vec_uchar16 a123 = {0, 1, 2,  3,  16, 17, 6,  7,              \
                                     8, 9, 18, 19, 12, 13, 14, 15};            \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                    \
    static const vec_uchar16 b12 = {0, 0, 6,  7,  22, 23, 0,  0,               \
                                    8, 9, 24, 25, 0,  0,  10, 11};             \
    static const vec_uchar16 b123 = {20, 21, 2,  3,  4,  5,  22, 23,           \
                                     8,  9,  10, 11, 24, 25, 14, 15};          \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                    \
    static const vec_uchar16 c12 = {26, 27, 0,  0,  12, 13, 28, 29,            \
                                    0,  0,  14, 15, 30, 31, 0,  0};            \
    static const vec_uchar16 c123 = {0,  1,  26, 27, 4,  5,  6,  7,            \
                                     28, 29, 10, 11, 12, 13, 30, 31};          \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                   \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c) {              \
    Tvec v1 = vsx_ld(0, ptr);                                                  \
    Tvec v2 = vsx_ld(8, ptr);                                                  \
    Tvec v3 = vsx_ld(16, ptr);                                                 \
    static const vec_uchar16 a12_perm = {0,  1,  6,  7,  12, 13, 18, 19,       \
                                         24, 25, 30, 31, 0,  0,  0,  0};       \
    static const vec_uchar16 a123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 10, 11, 20, 21, 26, 27};       \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                   \
    static const vec_uchar16 b12_perm = {2,  3,  8, 9, 14, 15, 20, 21,         \
                                         26, 27, 0, 0, 0,  0,  0,  0};         \
    static const vec_uchar16 b123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 16, 17, 22, 23, 28, 29};       \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                   \
    static const vec_uchar16 c12_perm = {4,  5,  10, 11, 16, 17, 22, 23,       \
                                         28, 29, 0,  0,  0,  0,  0,  0};       \
    static const vec_uchar16 c123_perm = {0, 1, 2,  3,  4,  5,  6,  7,         \
                                          8, 9, 18, 19, 24, 25, 30, 31};       \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                   \
  }
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)

#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                 \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c, Tp *ptr) {    \
    Tvec hbc = vec_mergeh(b, c);                                               \
    static const vec_uchar16 ahbc = {0,  1,  2,  3,  16, 17, 18, 19,           \
                                     20, 21, 22, 23, 4,  5,  6,  7};           \
    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                    \
    Tvec lab = vec_mergel(a, b);                                               \
    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                      \
    static const vec_uchar16 clab = {8,  9,  10, 11, 24, 25, 26, 27,           \
                                     28, 29, 30, 31, 12, 13, 14, 15};          \
    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                    \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c) {              \
    Tvec v1 = vsx_ld(0, ptr);                                                  \
    Tvec v2 = vsx_ld(4, ptr);                                                  \
    Tvec v3 = vsx_ld(8, ptr);                                                  \
    static const vec_uchar16 flp = {0,  1,  2,  3,  12, 13, 14, 15,            \
                                    16, 17, 18, 19, 28, 29, 30, 31};           \
    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                 \
    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0,  1,  2,  3,            \
                                     12, 13, 14, 15, 16, 17, 18, 19};          \
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                 \
  }
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)

#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)               \
  VSX_FINLINE(void)                                                            \
  vec_st_interleave(const Tvec &a, const Tvec &b, const Tvec &c, Tp *ptr) {    \
    st_func(vec_mergeh(a, b), 0, ptr);                                         \
    st_func(vec_permi(c, a, 1), 2, ptr);                                       \
    st_func(vec_mergel(b, c), 4, ptr);                                         \
  }                                                                            \
  VSX_FINLINE(void)                                                            \
  vec_ld_deinterleave(const Tp *ptr, Tvec &a, Tvec &b, Tvec &c) {              \
    Tvec v1 = ld_func(0, ptr);                                                 \
    Tvec v2 = ld_func(2, ptr);                                                 \
    Tvec v3 = ld_func(4, ptr);                                                 \
    a = vec_permi(v1, v2, 1);                                                  \
    b = vec_permi(v1, v3, 2);                                                  \
    c = vec_permi(v2, v3, 1);                                                  \
  }
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)

#endif // CV_VSX

//! @}

#endif // OPENCV_HAL_VSX_UTILS_HPP