// Convert YUV 4:2:0 semi-planar data to packed RGB (8 bits per channel).
//
// Layout (established by the pointer math below):
//   - yuv420sp: w*h bytes of Y, immediately followed by the interleaved
//     chroma plane, which is consumed here as [V, U] byte pairs
//     (vuptr[0] -> V, vuptr[1] -> U in the scalar tail), i.e. NV21
//     ordering -- confirm against callers if NV12 input is possible.
//   - rgb: receives w*h*3 bytes of packed R,G,B.
//
// Fixed-point scheme: Y is pre-scaled by 64 (<<6) and the conversion
// weights are rounded to the integers 90 / 46 / 22 / 113 -- the full
// derivation is kept as comments in the scalar tail loop below. Rows are
// processed in pairs so each VU pair is reused for the 2x2 pixel block it
// covers; w and h are assumed even (not checked here).
//
// Fixes versus the previous revision:
//   - the scalar tail loop consumed 2 pixels per iteration (yptr0 += 2,
//     rgb0 += 6) but decremented `remain` by only 1, so it processed
//     2*remain pixels and overran both the source row and the rgb buffer
//     whenever remain > 0 (always, on non-NEON builds where remain == w);
//     it now steps `remain -= 2`;
//   - the NEON coefficient vectors were declared outside the __ARM_NEON
//     guard, breaking the scalar-only fallback build; they are now guarded.
void yuv420sp_to_rgb_fast_asm(const unsigned char* yuv420sp, int w, int h, unsigned char* rgb)
{
    const unsigned char* yptr = yuv420sp;
    const unsigned char* vuptr = yuv420sp + w * h; // chroma plane follows luma
#if __ARM_NEON
    // NOTE: 128 narrows to -128 in int8_t; subtracting -128 equals
    // subtracting 128 modulo 256, which is exactly the intended 8-bit
    // chroma re-centering.
    int8x8_t _v128 = vdup_n_s8(128);
    int8x8_t _v90 = vdup_n_s8(90);   // R += 90 * V
    int8x8_t _v46 = vdup_n_s8(46);   // G -= 46 * V
    int8x8_t _v22 = vdup_n_s8(22);   // G -= 22 * U
    int8x8_t _v113 = vdup_n_s8(113); // B += 113 * U
#endif // __ARM_NEON
    for (int y=0; y<h; y+=2)
    {
        const unsigned char* yptr0 = yptr;     // even row
        const unsigned char* yptr1 = yptr + w; // odd row (shares chroma)
        unsigned char* rgb0 = rgb;
        unsigned char* rgb1 = rgb + w*3;

#if __ARM_NEON
        int nn = w >> 3;               // 8-pixel vector iterations
        int remain = w - (nn << 3);    // leftover pixels (0..7)
#else
        int remain = w;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
        for (; nn>0; nn--)
        {
            // 8 luma values per row, widened and pre-scaled by 64.
            int16x8_t _yy0 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr0), 6));
            int16x8_t _yy1 = vreinterpretq_s16_u16(vshll_n_u8(vld1_u8(yptr1), 6));

            // 4 interleaved [V,U] pairs, re-centered around 0. vtrn of the
            // vector with itself duplicates each even lane into val[0] and
            // each odd lane into val[1], so every chroma sample drives two
            // horizontal pixels.
            int8x8_t _vvuu = vsub_s8(vreinterpret_s8_u8(vld1_u8(vuptr)), _v128);
            int8x8x2_t _vvvvuuuu = vtrn_s8(_vvuu, _vvuu);
            int8x8_t _vv = _vvvvuuuu.val[0];
            int8x8_t _uu = _vvvvuuuu.val[1];

            int16x8_t _r0 = vmlal_s8(_yy0, _vv, _v90);
            int16x8_t _g0 = vmlsl_s8(_yy0, _vv, _v46);
            _g0 = vmlsl_s8(_g0, _uu, _v22);
            int16x8_t _b0 = vmlal_s8(_yy0, _uu, _v113);

            int16x8_t _r1 = vmlal_s8(_yy1, _vv, _v90);
            int16x8_t _g1 = vmlsl_s8(_yy1, _vv, _v46);
            _g1 = vmlsl_s8(_g1, _uu, _v22);
            int16x8_t _b1 = vmlal_s8(_yy1, _uu, _v113);

            // Saturating narrow with the >>6 descale, then interleaved store.
            uint8x8x3_t _rgb0;
            _rgb0.val[0] = vqshrun_n_s16(_r0, 6);
            _rgb0.val[1] = vqshrun_n_s16(_g0, 6);
            _rgb0.val[2] = vqshrun_n_s16(_b0, 6);

            uint8x8x3_t _rgb1;
            _rgb1.val[0] = vqshrun_n_s16(_r1, 6);
            _rgb1.val[1] = vqshrun_n_s16(_g1, 6);
            _rgb1.val[2] = vqshrun_n_s16(_b1, 6);

            vst3_u8(rgb0, _rgb0);
            vst3_u8(rgb1, _rgb1);

            yptr0 += 8;
            yptr1 += 8;
            vuptr += 8;
            rgb0 += 24;
            rgb1 += 24;
        }
#else
        if (nn > 0)
        {
            // Same computation as the aarch64 intrinsics above, software
            // pipelined for armv7: the VU pair for the NEXT iteration is
            // loaded near the bottom of the loop body, so one extra 8-byte
            // load happens after the final iteration and is rewound by the
            // trailing "sub %3, #8".
            // Register roles per iteration:
            //   d0/d1  = Y row0/row1          q2/q3  = (Y<<6) then B0/B1
            //   d2/d3  = V-dup / U-dup (after vtrn of the centered VU pair)
            //   q8/q10 = R0/R1   q9/q11 = G0/G1
            //   d24-d26 = packed R0,G0,B0   d4-d6 = packed R1,G1,B1
            asm volatile(
                "pld [%3, #128] \n"
                "vld1.u8 {d2}, [%3]! \n" // preload first VU pair
                "vsub.s8 d2, d2, %12 \n" // center chroma around 0
                "0: \n"
                "pld [%1, #128] \n"
                "vld1.u8 {d0}, [%1]! \n"
                "pld [%2, #128] \n"
                "vld1.u8 {d1}, [%2]! \n"
                "vshll.u8 q2, d0, #6 \n" // y0 << 6
                "vorr d3, d2, d2 \n" // copy VU for the transpose
                "vshll.u8 q3, d1, #6 \n" // y1 << 6
                "vorr q9, q2, q2 \n"
                "vtrn.s8 d2, d3 \n" // d2 = VVVV..., d3 = UUUU...
                "vorr q11, q3, q3 \n"
                "vmlsl.s8 q9, d2, %14 \n" // G0 -= 46*V
                "vorr q8, q2, q2 \n"
                "vmlsl.s8 q11, d2, %14 \n" // G1 -= 46*V
                "vorr q10, q3, q3 \n"
                "vmlal.s8 q8, d2, %13 \n" // R0 += 90*V
                "vmlal.s8 q2, d3, %16 \n" // B0 += 113*U
                "vmlal.s8 q10, d2, %13 \n" // R1 += 90*V
                "vmlsl.s8 q9, d3, %15 \n" // G0 -= 22*U
                "vmlal.s8 q3, d3, %16 \n" // B1 += 113*U
                "vmlsl.s8 q11, d3, %15 \n" // G1 -= 22*U
                "vqshrun.s16 d24, q8, #6 \n" // R0, saturated >>6
                "vqshrun.s16 d26, q2, #6 \n" // B0
                "vqshrun.s16 d4, q10, #6 \n" // R1
                "vqshrun.s16 d25, q9, #6 \n" // G0
                "vqshrun.s16 d6, q3, #6 \n" // B1
                "vqshrun.s16 d5, q11, #6 \n" // G1
                "pld [%3, #128] \n"
                "vld1.u8 {d2}, [%3]! \n" // VU for next iteration
                "subs %0, #1 \n"
                "vst3.u8 {d24-d26}, [%4]! \n" // interleave R0,G0,B0
                "vsub.s8 d2, d2, %12 \n"
                "vst3.u8 {d4-d6}, [%5]! \n" // interleave R1,G1,B1
                "bne 0b \n"
                "sub %3, #8 \n" // undo the one-iteration over-read of VU
                : "=r"(nn), // %0
                "=r"(yptr0), // %1
                "=r"(yptr1), // %2
                "=r"(vuptr), // %3
                "=r"(rgb0), // %4
                "=r"(rgb1) // %5
                : "0"(nn),
                "1"(yptr0),
                "2"(yptr1),
                "3"(vuptr),
                "4"(rgb0),
                "5"(rgb1),
                "w"(_v128), // %12
                "w"(_v90), // %13
                "w"(_v46), // %14
                "w"(_v22), // %15
                "w"(_v113) // %16
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "d26"
            );
        }
#endif // __aarch64__
#endif // __ARM_NEON
        // Scalar tail: two pixels (one VU pair) per iteration, hence
        // remain -= 2. w must be even for this to terminate exactly.
        for (; remain>0; remain -= 2)
        {
            // R = 1.164 * yy + 1.596 * vv
            // G = 1.164 * yy - 0.813 * vv - 0.391 * uu
            // B = 1.164 * yy + 2.018 * uu
            // R = Y + (1.370705 * (V-128))
            // G = Y - (0.698001 * (V-128)) - (0.337633 * (U-128))
            // B = Y + (1.732446 * (U-128))
            // R = ((Y << 6) + 87.72512 * (V-128)) >> 6
            // G = ((Y << 6) - 44.672064 * (V-128) - 21.608512 * (U-128)) >> 6
            // B = ((Y << 6) + 110.876544 * (U-128)) >> 6
            // R = ((Y << 6) + 90 * (V-128)) >> 6
            // G = ((Y << 6) - 46 * (V-128) - 22 * (U-128)) >> 6
            // B = ((Y << 6) + 113 * (U-128)) >> 6
            // R = (yy + 90 * vv) >> 6
            // G = (yy - 46 * vv - 22 * uu) >> 6
            // B = (yy + 113 * uu) >> 6
            int v = vuptr[0] - 128;
            int u = vuptr[1] - 128;

            int ruv = 90 * v;
            int guv = -46 * v + -22 * u;
            int buv = 113 * u;

            int y00 = yptr0[0] << 6;
            rgb0[0] = cv::saturate_cast<uchar>((y00 + ruv) >> 6);
            rgb0[1] = cv::saturate_cast<uchar>((y00 + guv) >> 6);
            rgb0[2] = cv::saturate_cast<uchar>((y00 + buv) >> 6);

            int y01 = yptr0[1] << 6;
            rgb0[3] = cv::saturate_cast<uchar>((y01 + ruv) >> 6);
            rgb0[4] = cv::saturate_cast<uchar>((y01 + guv) >> 6);
            rgb0[5] = cv::saturate_cast<uchar>((y01 + buv) >> 6);

            int y10 = yptr1[0] << 6;
            rgb1[0] = cv::saturate_cast<uchar>((y10 + ruv) >> 6);
            rgb1[1] = cv::saturate_cast<uchar>((y10 + guv) >> 6);
            rgb1[2] = cv::saturate_cast<uchar>((y10 + buv) >> 6);

            int y11 = yptr1[1] << 6;
            rgb1[3] = cv::saturate_cast<uchar>((y11 + ruv) >> 6);
            rgb1[4] = cv::saturate_cast<uchar>((y11 + guv) >> 6);
            rgb1[5] = cv::saturate_cast<uchar>((y11 + buv) >> 6);

            yptr0 += 2;
            yptr1 += 2;
            vuptr += 2;
            rgb0 += 6;
            rgb1 += 6;
        }

        yptr += 2*w;     // advance two luma rows
        rgb += 2*3*w;    // advance two RGB rows
    }
}
#if CV_NEON
// NEON specialization of the float RGB -> YCrCb (or YUV) converter.
// Processes 4 pixels (12 output floats) per vector iteration, with a scalar
// tail loop for the remainder.
template <>
struct RGB2YCrCb_f<float>
{
typedef float channel_type;
// _srccn: source channel count (3 or 4; the 4th channel is never read).
// _blueIdx: index of blue within a source pixel (0 for BGR order, 2 for RGB).
// _isCrCb: true selects the YCrCb coefficient set/ordering, false selects YUV.
RGB2YCrCb_f(int _srccn, int _blueIdx, bool _isCrCb) :
srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
{
// coeffs = { three luma weights (R,G,B order), then the Cr and Cb scales }.
static const float coeffs_crb[] = { R2YF, G2YF, B2YF, YCRF, YCBF };
static const float coeffs_yuv[] = { R2YF, G2YF, B2YF, R2VF, B2UF };
memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
// For blue-first input swap the R and B luma weights so src[0..2] can be
// dotted with coeffs[0..2] as-is.
if(blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
// Broadcast all five coefficients and the chroma offset into NEON registers.
v_c0 = vdupq_n_f32(coeffs[0]);
v_c1 = vdupq_n_f32(coeffs[1]);
v_c2 = vdupq_n_f32(coeffs[2]);
v_c3 = vdupq_n_f32(coeffs[3]);
v_c4 = vdupq_n_f32(coeffs[4]);
v_delta = vdupq_n_f32(ColorChannel<float>::half());
}
// Convert n pixels from src (scn interleaved channels each) into 3-channel dst.
void operator()(const float * src, float * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
const float delta = ColorChannel<float>::half();
float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
n *= 3; // n now counts output floats rather than pixels
if (scn == 3)
for ( ; i <= n - 12; i += 12, src += 12)
{
// Y = c0*ch0 + c1*ch1 + c2*ch2; with bidx in {0,2}, src[bidx^2] is the
// red channel and src[bidx] the blue one, so:
// Cr = (R - Y)*c3 + delta, Cb = (B - Y)*c4 + delta.
// yuvOrder swaps the two chroma output slots for YUV ordering.
float32x4x3_t v_src = vld3q_f32(src), v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
vst3q_f32(dst + i, v_dst);
}
else
// 4-channel source: load 16 floats per iteration, ignore the 4th channel,
// still emit 12 output floats.
for ( ; i <= n - 12; i += 12, src += 16)
{
float32x4x4_t v_src = vld4q_f32(src);
float32x4x3_t v_dst;
v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
v_dst.val[1+yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
v_dst.val[2-yuvOrder] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
vst3q_f32(dst + i, v_dst);
}
// Scalar tail for the last few pixels (same math as the vector body).
for ( ; i < n; i += 3, src += scn)
{
float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
float Cr = (src[bidx^2] - Y)*C3 + delta;
float Cb = (src[bidx] - Y)*C4 + delta;
dst[i] = Y; dst[i+1+yuvOrder] = Cr; dst[i+2-yuvOrder] = Cb;
}
}
int srccn, blueIdx;
bool isCrCb;
float coeffs[5]; // { c0, c1, c2 (luma), c3 (Cr/V scale), c4 (Cb/U scale) }
float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
};
#if CV_NEON
// NEON specialization of the 8-bit integer RGB -> YCrCb (or YUV) converter.
// Fixed-point: coefficients are pre-scaled by 2^yuv_shift; results are
// descaled with rounding (add v_delta2 = half, then arithmetic shift), which
// mirrors the CV_DESCALE macro used in the scalar tail.
// Processes 8 pixels (24 output bytes) per vector iteration.
template <>
struct RGB2YCrCb_i<uchar>
{
typedef uchar channel_type;
// _srccn: source channel count (3 or 4; the 4th channel is never read).
// _blueIdx: index of blue within a source pixel (0 for BGR order, 2 for RGB).
// _isCrCb: true selects the YCrCb coefficient set/ordering, false selects YUV.
RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
: srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
{
static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
// For blue-first input swap the R and B luma weights so src[0..2] can be
// dotted with coeffs[0..2] as-is.
if (blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
// Luma weights stay 16-bit (used with vmlal_s16 into 32-bit accumulators);
// chroma scales and offsets are kept as 32-bit vectors.
v_c0 = vdup_n_s16(coeffs[0]);
v_c1 = vdup_n_s16(coeffs[1]);
v_c2 = vdup_n_s16(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_c4 = vdupq_n_s32(coeffs[4]);
v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift)); // chroma midpoint, pre-scaled
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term for the descale
}
// Convert n pixels from src (scn interleaved channels each) into 3-channel dst.
void operator()(const uchar * src, uchar * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
n *= 3; // n now counts output bytes rather than pixels
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint8x8x3_t v_dst;
int16x8x3_t v_src16;
// Deinterleave 8 pixels and widen u8 -> s16 (4th channel dropped if scn==4).
if (scn == 3)
{
uint8x8x3_t v_src = vld3_u8(src);
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
}
else
{
uint8x8x4_t v_src = vld4_u8(src);
v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
}
// First half: pixels 0..3. With bidx in {0,2}, val[bidx^2] is red and
// val[bidx] is blue, so Cr = (R - Y)*c3 + delta, Cb = (B - Y)*c4 + delta,
// each descaled with rounding by (x + half) >> yuv_shift.
int16x4x3_t v_src0;
v_src0.val[0] = vget_low_s16(v_src16.val[0]);
v_src0.val[1] = vget_low_s16(v_src16.val[1]);
v_src0.val[2] = vget_low_s16(v_src16.val[2]);
int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
// Second half: v_src0 is deliberately reused for pixels 4..7.
v_src0.val[0] = vget_high_s16(v_src16.val[0]);
v_src0.val[1] = vget_high_s16(v_src16.val[1]);
v_src0.val[2] = vget_high_s16(v_src16.val[2]);
int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
// Saturating narrow s32 -> s16 -> u8; yuvOrder swaps the chroma slots.
v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
v_dst.val[1+yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
v_dst.val[2-yuvOrder] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
vst3_u8(dst + i, v_dst);
}
// Scalar tail (same math as the vector body, via CV_DESCALE).
for ( ; i < n; i += 3, src += scn)
{
int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
dst[i] = saturate_cast<uchar>(Y);
dst[i+1+yuvOrder] = saturate_cast<uchar>(Cr);
dst[i+2-yuvOrder] = saturate_cast<uchar>(Cb);
}
}
int srccn, blueIdx, coeffs[5];
bool isCrCb;
int16x4_t v_c0, v_c1, v_c2;
int32x4_t v_c3, v_c4, v_delta, v_delta2;
};
// NEON specialization of the 16-bit integer RGB -> YCrCb (or YUV) converter.
// Same fixed-point scheme as the uchar variant (coefficients pre-scaled by
// 2^yuv_shift, rounding descale via v_delta2), but all arithmetic is done in
// 32-bit lanes since 16-bit products would overflow.
// Three loops: 8 pixels per iteration, then 4 pixels, then a scalar tail.
template <>
struct RGB2YCrCb_i<ushort>
{
typedef ushort channel_type;
// _srccn: source channel count (3 or 4; the 4th channel is never read).
// _blueIdx: index of blue within a source pixel (0 for BGR order, 2 for RGB).
// _isCrCb: true selects the YCrCb coefficient set/ordering, false selects YUV.
RGB2YCrCb_i(int _srccn, int _blueIdx, bool _isCrCb)
: srccn(_srccn), blueIdx(_blueIdx), isCrCb(_isCrCb)
{
static const int coeffs_crb[] = { R2Y, G2Y, B2Y, YCRI, YCBI };
static const int coeffs_yuv[] = { R2Y, G2Y, B2Y, R2VI, B2UI };
memcpy(coeffs, isCrCb ? coeffs_crb : coeffs_yuv, 5*sizeof(coeffs[0]));
// For blue-first input swap the R and B luma weights so src[0..2] can be
// dotted with coeffs[0..2] as-is.
if (blueIdx==0)
std::swap(coeffs[0], coeffs[2]);
// All five coefficients broadcast as 32-bit lanes.
v_c0 = vdupq_n_s32(coeffs[0]);
v_c1 = vdupq_n_s32(coeffs[1]);
v_c2 = vdupq_n_s32(coeffs[2]);
v_c3 = vdupq_n_s32(coeffs[3]);
v_c4 = vdupq_n_s32(coeffs[4]);
v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift)); // chroma midpoint, pre-scaled
v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1)); // rounding term for the descale
}
// Convert n pixels from src (scn interleaved channels each) into 3-channel dst.
void operator()(const ushort * src, ushort * dst, int n) const
{
int scn = srccn, bidx = blueIdx, i = 0;
int yuvOrder = !isCrCb; //1 if YUV, 0 if YCrCb
int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
n *= 3; // n now counts output elements rather than pixels
// Main loop: 8 pixels (24 output ushorts) per iteration.
for ( ; i <= n - 24; i += 24, src += scn * 8)
{
uint16x8x3_t v_src, v_dst;
int32x4x3_t v_src0;
// Deinterleave 8 pixels (4th channel dropped if scn==4).
if (scn == 3)
v_src = vld3q_u16(src);
else
{
uint16x8x4_t v_src_ = vld4q_u16(src);
v_src.val[0] = v_src_.val[0];
v_src.val[1] = v_src_.val[1];
v_src.val[2] = v_src_.val[2];
}
// First half: pixels 0..3, widened u16 -> s32. With bidx in {0,2},
// val[bidx^2] is red and val[bidx] is blue, so Cr = (R - Y)*c3 + delta
// and Cb = (B - Y)*c4 + delta, each descaled with rounding.
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
// Second half: v_src0 is deliberately reused for pixels 4..7.
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
// Saturating narrow s32 -> u16; yuvOrder swaps the chroma slots.
v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
v_dst.val[1+yuvOrder] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
v_dst.val[2-yuvOrder] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
vst3q_u16(dst + i, v_dst);
}
// Half-width loop: 4 pixels (12 output ushorts) per iteration.
for ( ; i <= n - 12; i += 12, src += scn * 4)
{
uint16x4x3_t v_dst;
int32x4x3_t v_src0;
if (scn == 3)
{
uint16x4x3_t v_src = vld3_u16(src);
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
}
else
{
uint16x4x4_t v_src = vld4_u16(src);
v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
}
int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
v_dst.val[0] = vqmovun_s32(v_Y);
v_dst.val[1+yuvOrder] = vqmovun_s32(v_Cr);
v_dst.val[2-yuvOrder] = vqmovun_s32(v_Cb);
vst3_u16(dst + i, v_dst);
}
// Scalar tail (same math as the vector loops, via CV_DESCALE).
for ( ; i < n; i += 3, src += scn)
{
int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
dst[i] = saturate_cast<ushort>(Y);
dst[i+1+yuvOrder] = saturate_cast<ushort>(Cr);
dst[i+2-yuvOrder] = saturate_cast<ushort>(Cb);
}
}
int srccn, blueIdx, coeffs[5];
bool isCrCb;
int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
};